• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::sync::Arc;
6 use std::u32;
7 
8 use base::{
9     error, pagesize, AsRawDescriptor, Event, MappedRegion, MemoryMapping, MemoryMappingBuilder,
10     RawDescriptor, Tube,
11 };
12 use hypervisor::Datamatch;
13 
14 use resources::{Alloc, MmioType, SystemAllocator};
15 
16 use vfio_sys::*;
17 use vm_control::{VmIrqRequest, VmIrqResponse, VmMemoryRequest, VmMemoryResponse};
18 
19 use crate::pci::msix::{
20     MsixConfig, BITS_PER_PBA_ENTRY, MSIX_PBA_ENTRIES_MODULO, MSIX_TABLE_ENTRIES_MODULO,
21 };
22 
23 use crate::pci::pci_device::{Error as PciDeviceError, PciDevice};
24 use crate::pci::{PciAddress, PciClassCode, PciInterruptPin};
25 
26 use crate::vfio::{VfioDevice, VfioIrqType};
27 
// Standard PCI configuration-space register offsets and bit values used below.
const PCI_VENDOR_ID: u32 = 0x0; // Vendor ID register offset.
const INTEL_VENDOR_ID: u16 = 0x8086; // Intel's PCI vendor ID.
const PCI_COMMAND: u32 = 0x4; // Command register offset.
const PCI_COMMAND_MEMORY: u8 = 0x2; // Memory-space enable bit in the command register.
const PCI_BASE_CLASS_CODE: u32 = 0x0B; // Base class code register offset.

const PCI_INTERRUPT_PIN: u32 = 0x3D; // Interrupt pin register offset; nonzero means INTx is used.
/// Thin wrapper around a `VfioDevice` that reads and writes the device's
/// PCI configuration space through the VFIO config region.
struct VfioPciConfig {
    device: Arc<VfioDevice>,
}
39 
40 impl VfioPciConfig {
new(device: Arc<VfioDevice>) -> Self41     fn new(device: Arc<VfioDevice>) -> Self {
42         VfioPciConfig { device }
43     }
44 
45     #[allow(dead_code)]
read_config_byte(&self, offset: u32) -> u846     fn read_config_byte(&self, offset: u32) -> u8 {
47         let mut data: [u8; 1] = [0];
48         self.device
49             .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
50 
51         data[0]
52     }
53 
54     #[allow(dead_code)]
read_config_word(&self, offset: u32) -> u1655     fn read_config_word(&self, offset: u32) -> u16 {
56         let mut data: [u8; 2] = [0, 0];
57         self.device
58             .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
59 
60         u16::from_le_bytes(data)
61     }
62 
63     #[allow(dead_code)]
read_config_dword(&self, offset: u32) -> u3264     fn read_config_dword(&self, offset: u32) -> u32 {
65         let mut data: [u8; 4] = [0, 0, 0, 0];
66         self.device
67             .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
68 
69         u32::from_le_bytes(data)
70     }
71 
72     #[allow(dead_code)]
write_config_byte(&self, buf: u8, offset: u32)73     fn write_config_byte(&self, buf: u8, offset: u32) {
74         self.device.region_write(
75             VFIO_PCI_CONFIG_REGION_INDEX,
76             ::std::slice::from_ref(&buf),
77             offset.into(),
78         )
79     }
80 
81     #[allow(dead_code)]
write_config_word(&self, buf: u16, offset: u32)82     fn write_config_word(&self, buf: u16, offset: u32) {
83         let data: [u8; 2] = buf.to_le_bytes();
84         self.device
85             .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into())
86     }
87 
88     #[allow(dead_code)]
write_config_dword(&self, buf: u32, offset: u32)89     fn write_config_dword(&self, buf: u32, offset: u32) {
90         let data: [u8; 4] = buf.to_le_bytes();
91         self.device
92             .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into())
93     }
94 }
95 
const PCI_CAPABILITY_LIST: u32 = 0x34; // Offset of the first capability pointer.
const PCI_CAP_ID_MSI: u8 = 0x05; // Capability ID: MSI.
const PCI_CAP_ID_MSIX: u8 = 0x11; // Capability ID: MSI-X.

// MSI registers (offsets are relative to the start of the capability).
const PCI_MSI_NEXT_POINTER: u32 = 0x1; // Next cap pointer
const PCI_MSI_FLAGS: u32 = 0x2; // Message Control
const PCI_MSI_FLAGS_ENABLE: u16 = 0x0001; // MSI feature enabled
const PCI_MSI_FLAGS_64BIT: u16 = 0x0080; // 64-bit addresses allowed
const PCI_MSI_FLAGS_MASKBIT: u16 = 0x0100; // Per-vector masking capable
const PCI_MSI_ADDRESS_LO: u32 = 0x4; // MSI address lower 32 bits
const PCI_MSI_ADDRESS_HI: u32 = 0x8; // MSI address upper 32 bits (if 64 bit allowed)
const PCI_MSI_DATA_32: u32 = 0x8; // 16 bits of data for 32-bit message address
const PCI_MSI_DATA_64: u32 = 0xC; // 16 bits of data for 64-bit message address

// Total MSI capability length for each combination of 64-bit addressing
// and per-vector masking support.
const MSI_LENGTH_32BIT_WITHOUT_MASK: u32 = 0xA;
const MSI_LENGTH_32BIT_WITH_MASK: u32 = 0x14;
const MSI_LENGTH_64BIT_WITHOUT_MASK: u32 = 0xE;
const MSI_LENGTH_64BIT_WITH_MASK: u32 = 0x18;
116 
/// Direction of an interrupt enable/disable transition detected while
/// emulating guest writes to the MSI or MSI-X capability.
enum VfioMsiChange {
    Disable,
    Enable,
}
121 
/// Emulated MSI capability state for a VFIO PCI device.
struct VfioMsiCap {
    offset: u32,          // Offset of the MSI capability in config space.
    is_64bit: bool,       // Device supports 64-bit message addresses.
    mask_cap: bool,       // Device supports per-vector masking.
    ctl: u16,             // Last guest-written message control value.
    address: u64,         // Guest-programmed message address.
    data: u16,            // Guest-programmed message data.
    vm_socket_irq: Tube,  // Channel to the main process for irq routing.
    irqfd: Option<Event>, // Event used by VFIO to signal this MSI.
    gsi: Option<u32>,     // GSI allocated for this MSI, once known.
}
133 
impl VfioMsiCap {
    /// Parses the device's MSI capability at `msi_cap_start` to learn its
    /// 64-bit addressing and per-vector-mask support.
    fn new(config: &VfioPciConfig, msi_cap_start: u32, vm_socket_irq: Tube) -> Self {
        let msi_ctl = config.read_config_word(msi_cap_start + PCI_MSI_FLAGS);

        VfioMsiCap {
            offset: msi_cap_start,
            is_64bit: (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0,
            mask_cap: (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0,
            ctl: 0,
            address: 0,
            data: 0,
            vm_socket_irq,
            irqfd: None,
            gsi: None,
        }
    }

    /// Returns true if a config-space access at `index` of `len` bytes lies
    /// entirely within this MSI capability.
    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
        // The capability length depends on 64-bit and masking support.
        let msi_len: u32 = if self.is_64bit {
            if self.mask_cap {
                MSI_LENGTH_64BIT_WITH_MASK
            } else {
                MSI_LENGTH_64BIT_WITHOUT_MASK
            }
        } else {
            if self.mask_cap {
                MSI_LENGTH_32BIT_WITH_MASK
            } else {
                MSI_LENGTH_32BIT_WITHOUT_MASK
            }
        };

        index >= self.offset as u64
            && index + len as u64 <= (self.offset + msi_len) as u64
            && len as u32 <= msi_len
    }

    /// Emulates a guest write into the MSI capability, tracking the control,
    /// address and data registers. Returns the enable/disable transition, if
    /// any, so the caller can reprogram VFIO interrupts accordingly.
    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
        let len = data.len();
        let offset = index as u32 - self.offset;
        let mut ret: Option<VfioMsiChange> = None;
        let old_address = self.address;
        let old_data = self.data;

        // write msi ctl
        if len == 2 && offset == PCI_MSI_FLAGS {
            let was_enabled = self.is_msi_enabled();
            let value: [u8; 2] = [data[0], data[1]];
            self.ctl = u16::from_le_bytes(value);
            let is_enabled = self.is_msi_enabled();
            if !was_enabled && is_enabled {
                self.enable();
                ret = Some(VfioMsiChange::Enable);
            } else if was_enabled && !is_enabled {
                ret = Some(VfioMsiChange::Disable)
            }
        } else if len == 4 && offset == PCI_MSI_ADDRESS_LO && !self.is_64bit {
            // write 32 bit message address
            let value: [u8; 8] = [data[0], data[1], data[2], data[3], 0, 0, 0, 0];
            self.address = u64::from_le_bytes(value);
        } else if len == 4 && offset == PCI_MSI_ADDRESS_LO && self.is_64bit {
            // write 64 bit message address low part
            let value: [u8; 8] = [data[0], data[1], data[2], data[3], 0, 0, 0, 0];
            self.address &= !0xffffffff;
            self.address |= u64::from_le_bytes(value);
        } else if len == 4 && offset == PCI_MSI_ADDRESS_HI && self.is_64bit {
            // write 64 bit message address high part
            let value: [u8; 8] = [0, 0, 0, 0, data[0], data[1], data[2], data[3]];
            self.address &= 0xffffffff;
            self.address |= u64::from_le_bytes(value);
        } else if len == 8 && offset == PCI_MSI_ADDRESS_LO && self.is_64bit {
            // write 64 bit message address in one access
            let value: [u8; 8] = [
                data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
            ];
            self.address = u64::from_le_bytes(value);
        } else if len == 2
            && ((offset == PCI_MSI_DATA_32 && !self.is_64bit)
                || (offset == PCI_MSI_DATA_64 && self.is_64bit))
        {
            // write message data; its offset depends on 64-bit support
            let value: [u8; 2] = [data[0], data[1]];
            self.data = u16::from_le_bytes(value);
        }

        // Reprogram the route whenever MSI is on and address/data changed.
        if self.is_msi_enabled() && (old_address != self.address || old_data != self.data) {
            self.add_msi_route();
        }

        ret
    }

    /// True when the guest has set the MSI enable bit in message control.
    fn is_msi_enabled(&self) -> bool {
        self.ctl & PCI_MSI_FLAGS_ENABLE == PCI_MSI_FLAGS_ENABLE
    }

    /// Asks the main process to (re)route this MSI's gsi to the current
    /// address/data pair. A gsi must have been allocated beforehand.
    fn add_msi_route(&self) {
        let gsi = match self.gsi {
            Some(g) => g,
            None => {
                error!("Add msi route but gsi is none");
                return;
            }
        };
        if let Err(e) = self.vm_socket_irq.send(&VmIrqRequest::AddMsiRoute {
            gsi,
            msi_address: self.address,
            msi_data: self.data.into(),
        }) {
            error!("failed to send AddMsiRoute request at {:?}", e);
            return;
        }
        // Each send must be paired with a recv to keep the Tube in sync.
        match self.vm_socket_irq.recv() {
            Ok(VmIrqResponse::Err(e)) => error!("failed to call AddMsiRoute request {:?}", e),
            Ok(_) => {}
            Err(e) => error!("failed to receive AddMsiRoute response {:?}", e),
        }
    }

    /// Asks the main process to allocate a gsi bound to our irqfd, creating
    /// the irqfd first if it does not exist yet.
    fn allocate_one_msi(&mut self) {
        let irqfd = match self.irqfd.take() {
            Some(e) => e,
            None => match Event::new() {
                Ok(e) => e,
                Err(e) => {
                    error!("failed to create event: {:?}", e);
                    return;
                }
            },
        };

        let request = VmIrqRequest::AllocateOneMsi { irqfd };
        let request_result = self.vm_socket_irq.send(&request);

        // Stash the irqfd in self immediately because we used take above.
        self.irqfd = match request {
            VmIrqRequest::AllocateOneMsi { irqfd } => Some(irqfd),
            _ => unreachable!(),
        };

        if let Err(e) = request_result {
            error!("failed to send AllocateOneMsi request: {:?}", e);
            return;
        }

        match self.vm_socket_irq.recv() {
            Ok(VmIrqResponse::AllocateOneMsi { gsi }) => self.gsi = Some(gsi),
            _ => error!("failed to receive AllocateOneMsi Response"),
        }
    }

    /// Brings MSI up: allocates a gsi/irqfd on first use, then programs the route.
    fn enable(&mut self) {
        if self.gsi.is_none() || self.irqfd.is_none() {
            self.allocate_one_msi();
        }

        self.add_msi_route();
    }

    /// Event used by VFIO to signal this MSI, if one has been allocated.
    fn get_msi_irqfd(&self) -> Option<&Event> {
        self.irqfd.as_ref()
    }
}
297 
// MSI-X registers in the MSI-X capability (offsets relative to its start).
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size (encoded as N-1)
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
307 
/// Emulated MSI-X capability state for a VFIO PCI device.
struct VfioMsixCap {
    config: MsixConfig, // Emulated MSI-X table/PBA and enable state.
    offset: u32,        // Offset of the MSI-X capability in config space.
    table_size: u16,    // Number of MSI-X table entries.
    table_pci_bar: u32, // BAR index holding the MSI-X table.
    table_offset: u64,  // Byte offset of the table within its BAR.
    pba_pci_bar: u32,   // BAR index holding the pending bit array.
    pba_offset: u64,    // Byte offset of the PBA within its BAR.
}
317 
318 impl VfioMsixCap {
new(config: &VfioPciConfig, msix_cap_start: u32, vm_socket_irq: Tube) -> Self319     fn new(config: &VfioPciConfig, msix_cap_start: u32, vm_socket_irq: Tube) -> Self {
320         let msix_ctl = config.read_config_word(msix_cap_start + PCI_MSIX_FLAGS);
321         let table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) + 1;
322         let table = config.read_config_dword(msix_cap_start + PCI_MSIX_TABLE);
323         let table_pci_bar = table & PCI_MSIX_TABLE_BIR;
324         let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
325         let pba = config.read_config_dword(msix_cap_start + PCI_MSIX_PBA);
326         let pba_pci_bar = pba & PCI_MSIX_PBA_BIR;
327         let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;
328 
329         VfioMsixCap {
330             config: MsixConfig::new(table_size, vm_socket_irq),
331             offset: msix_cap_start,
332             table_size,
333             table_pci_bar,
334             table_offset,
335             pba_pci_bar,
336             pba_offset,
337         }
338     }
339 
340     // only msix control register is writable and need special handle in pci r/w
is_msix_control_reg(&self, offset: u32, size: u32) -> bool341     fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
342         let control_start = self.offset + PCI_MSIX_FLAGS;
343         let control_end = control_start + 2;
344 
345         offset < control_end && offset + size > control_start
346     }
347 
read_msix_control(&self, data: &mut u32)348     fn read_msix_control(&self, data: &mut u32) {
349         *data = self.config.read_msix_capability(*data);
350     }
351 
write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange>352     fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
353         let old_enabled = self.config.enabled();
354 
355         self.config
356             .write_msix_capability(PCI_MSIX_FLAGS.into(), data);
357 
358         let new_enabled = self.config.enabled();
359         if !old_enabled && new_enabled {
360             Some(VfioMsiChange::Enable)
361         } else if old_enabled && !new_enabled {
362             Some(VfioMsiChange::Disable)
363         } else {
364             None
365         }
366     }
367 
is_msix_table(&self, bar_index: u32, offset: u64) -> bool368     fn is_msix_table(&self, bar_index: u32, offset: u64) -> bool {
369         let table_size: u64 = (self.table_size * (MSIX_TABLE_ENTRIES_MODULO as u16)).into();
370         bar_index == self.table_pci_bar
371             && offset >= self.table_offset
372             && offset < self.table_offset + table_size
373     }
374 
read_table(&self, offset: u64, data: &mut [u8])375     fn read_table(&self, offset: u64, data: &mut [u8]) {
376         let offset = offset - self.table_offset;
377         self.config.read_msix_table(offset, data);
378     }
379 
write_table(&mut self, offset: u64, data: &[u8])380     fn write_table(&mut self, offset: u64, data: &[u8]) {
381         let offset = offset - self.table_offset;
382         self.config.write_msix_table(offset, data);
383     }
384 
is_msix_pba(&self, bar_index: u32, offset: u64) -> bool385     fn is_msix_pba(&self, bar_index: u32, offset: u64) -> bool {
386         let pba_size: u64 = (((self.table_size + BITS_PER_PBA_ENTRY as u16 - 1)
387             / BITS_PER_PBA_ENTRY as u16)
388             * MSIX_PBA_ENTRIES_MODULO as u16) as u64;
389         bar_index == self.pba_pci_bar
390             && offset >= self.pba_offset
391             && offset < self.pba_offset + pba_size
392     }
393 
read_pba(&self, offset: u64, data: &mut [u8])394     fn read_pba(&self, offset: u64, data: &mut [u8]) {
395         let offset = offset - self.pba_offset;
396         self.config.read_pba_entries(offset, data);
397     }
398 
write_pba(&mut self, offset: u64, data: &[u8])399     fn write_pba(&mut self, offset: u64, data: &[u8]) {
400         let offset = offset - self.pba_offset;
401         self.config.write_pba_entries(offset, data);
402     }
403 
is_msix_bar(&self, bar_index: u32) -> bool404     fn is_msix_bar(&self, bar_index: u32) -> bool {
405         bar_index == self.table_pci_bar || bar_index == self.pba_pci_bar
406     }
407 
get_msix_irqfds(&self) -> Option<Vec<&Event>>408     fn get_msix_irqfds(&self) -> Option<Vec<&Event>> {
409         let mut irqfds = Vec::new();
410 
411         for i in 0..self.table_size {
412             let irqfd = self.config.get_irqfd(i as usize);
413             if let Some(fd) = irqfd {
414                 irqfds.push(fd);
415             } else {
416                 return None;
417             }
418         }
419 
420         Some(irqfds)
421     }
422 }
423 
/// Location of a trapped MMIO BAR in guest physical address space.
///
/// `Clone`/`Copy` are derived (all fields are plain integers) so lookups
/// such as `find_region` can hand out a copy instead of reconstructing
/// the struct field by field.
#[derive(Clone, Copy)]
struct MmioInfo {
    bar_index: u32, // VFIO BAR/region index this range belongs to.
    start: u64,     // Guest physical address where the BAR is mapped.
    length: u64,    // Size of the mapped range in bytes.
}
429 
/// An I/O-port (PIO) BAR, identified by its VFIO BAR index.
struct IoInfo {
    bar_index: u32,
}
433 
/// Per-device-type auxiliary data.
enum DeviceData {
    // Intel graphics devices track which VFIO region index holds the
    // OpRegion (initialized to a sentinel until discovered).
    IntelGfxData { opregion_index: u32 },
}
437 
/// Implements the VFIO-backed PCI device that is exposed to the VM as a
/// regular PCI device.
pub struct VfioPciDevice {
    device: Arc<VfioDevice>,
    config: VfioPciConfig,
    pci_address: Option<PciAddress>,
    interrupt_evt: Option<Event>,          // INTx trigger event.
    interrupt_resample_evt: Option<Event>, // INTx resample event.
    mmio_regions: Vec<MmioInfo>,
    io_regions: Vec<IoInfo>,
    msi_cap: Option<VfioMsiCap>,
    msix_cap: Option<VfioMsixCap>,
    irq_type: Option<VfioIrqType>,
    vm_socket_mem: Tube, // Channel to the main process for guest memory mapping.
    device_data: Option<DeviceData>,

    // Scratch MemoryMappings kept alive to avoid unmapping before vm exit.
    mem: Vec<MemoryMapping>,
}
456 
impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device.
    pub fn new(
        device: VfioDevice,
        vfio_device_socket_msi: Tube,
        vfio_device_socket_msix: Tube,
        vfio_device_socket_mem: Tube,
    ) -> Self {
        let dev = Arc::new(device);
        let config = VfioPciConfig::new(Arc::clone(&dev));
        // Each socket may be consumed at most once while scanning capabilities.
        let mut msi_socket = Some(vfio_device_socket_msi);
        let mut msix_socket = Some(vfio_device_socket_msix);
        let mut msi_cap: Option<VfioMsiCap> = None;
        let mut msix_cap: Option<VfioMsixCap> = None;

        // Walk the PCI capability linked list, picking up MSI and MSI-X.
        let mut cap_next: u32 = config.read_config_byte(PCI_CAPABILITY_LIST).into();
        while cap_next != 0 {
            let cap_id = config.read_config_byte(cap_next);
            if cap_id == PCI_CAP_ID_MSI {
                if let Some(msi_socket) = msi_socket.take() {
                    msi_cap = Some(VfioMsiCap::new(&config, cap_next, msi_socket));
                }
            } else if cap_id == PCI_CAP_ID_MSIX {
                if let Some(msix_socket) = msix_socket.take() {
                    msix_cap = Some(VfioMsixCap::new(&config, cap_next, msix_socket));
                }
            }
            // The "next capability" pointer is one byte past the capability ID.
            let offset = cap_next + PCI_MSI_NEXT_POINTER;
            cap_next = config.read_config_byte(offset).into();
        }

        let vendor_id = config.read_config_word(PCI_VENDOR_ID);
        let class_code = config.read_config_byte(PCI_BASE_CLASS_CODE);

        // Intel display controllers get extra handling (opregion) elsewhere.
        let is_intel_gfx = vendor_id == INTEL_VENDOR_ID
            && class_code == PciClassCode::DisplayController.get_register_value();
        let device_data = if is_intel_gfx {
            Some(DeviceData::IntelGfxData {
                // Sentinel until the real opregion index is discovered.
                opregion_index: u32::max_value(),
            })
        } else {
            None
        };

        VfioPciDevice {
            device: dev,
            config,
            pci_address: None,
            interrupt_evt: None,
            interrupt_resample_evt: None,
            mmio_regions: Vec::new(),
            io_regions: Vec::new(),
            msi_cap,
            msix_cap,
            irq_type: None,
            vm_socket_mem: vfio_device_socket_mem,
            device_data,
            mem: Vec::new(),
        }
    }

    /// Returns true if this device was identified as Intel graphics.
    fn is_intel_gfx(&self) -> bool {
        let mut ret = false;

        if let Some(device_data) = &self.device_data {
            match *device_data {
                DeviceData::IntelGfxData { .. } => ret = true,
            }
        }

        ret
    }

    /// Finds the MMIO region containing guest address `addr`, returning a
    /// copy of its parameters if one exists.
    fn find_region(&self, addr: u64) -> Option<MmioInfo> {
        for mmio_info in self.mmio_regions.iter() {
            if addr >= mmio_info.start && addr < mmio_info.start + mmio_info.length {
                return Some(MmioInfo {
                    bar_index: mmio_info.bar_index,
                    start: mmio_info.start,
                    length: mmio_info.length,
                });
            }
        }

        None
    }

    /// Enables legacy INTx interrupts, wiring the interrupt event (and the
    /// resample event, if present) into VFIO.
    fn enable_intx(&mut self) {
        // Both events must have been assigned via assign_irq() first.
        if self.interrupt_evt.is_none() || self.interrupt_resample_evt.is_none() {
            return;
        }

        if let Some(ref interrupt_evt) = self.interrupt_evt {
            let mut fds = Vec::new();
            fds.push(interrupt_evt);
            if let Err(e) = self.device.irq_enable(fds, VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Intx enable failed: {}", e);
                return;
            }
            if let Some(ref irq_resample_evt) = self.interrupt_resample_evt {
                // Keep the line masked while the resample event is hooked up,
                // then unmask; tear everything down if any step fails.
                if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Intx mask failed: {}", e);
                    self.disable_intx();
                    return;
                }
                if let Err(e) = self
                    .device
                    .resample_virq_enable(irq_resample_evt, VFIO_PCI_INTX_IRQ_INDEX)
                {
                    error!("resample enable failed: {}", e);
                    self.disable_intx();
                    return;
                }
                if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Intx unmask failed: {}", e);
                    self.disable_intx();
                    return;
                }
            }
        }

        self.irq_type = Some(VfioIrqType::Intx);
    }

    /// Disables legacy INTx and clears the recorded irq type.
    fn disable_intx(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
            error!("Intx disable failed: {}", e);
        }
        self.irq_type = None;
    }

    /// Tears down whichever interrupt mode is currently active.
    fn disable_irqs(&mut self) {
        match self.irq_type {
            Some(VfioIrqType::Msi) => self.disable_msi(),
            Some(VfioIrqType::Msix) => self.disable_msix(),
            _ => (),
        }

        // Above disable_msi() or disable_msix() will enable intx again,
        // so disable_intx here again.
        if let Some(VfioIrqType::Intx) = self.irq_type {
            self.disable_intx();
        }
    }

    /// Switches the device to MSI, falling back to INTx on any failure.
    fn enable_msi(&mut self) {
        self.disable_irqs();

        let irqfd = match &self.msi_cap {
            Some(cap) => {
                if let Some(fd) = cap.get_msi_irqfd() {
                    fd
                } else {
                    self.enable_intx();
                    return;
                }
            }
            None => {
                self.enable_intx();
                return;
            }
        };

        let mut fds = Vec::new();
        fds.push(irqfd);
        if let Err(e) = self.device.irq_enable(fds, VFIO_PCI_MSI_IRQ_INDEX) {
            error!("failed to enable msi: {}", e);
            self.enable_intx();
            return;
        }

        self.irq_type = Some(VfioIrqType::Msi);
    }

    /// Disables MSI and falls back to INTx.
    fn disable_msi(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
            error!("failed to disable msi: {}", e);
            return;
        }

        self.enable_intx();
    }

    /// Switches the device to MSI-X, falling back to INTx on any failure.
    fn enable_msix(&mut self) {
        self.disable_irqs();

        let irqfds = match &self.msix_cap {
            Some(cap) => cap.get_msix_irqfds(),
            None => return,
        };

        if let Some(descriptors) = irqfds {
            if let Err(e) = self.device.irq_enable(descriptors, VFIO_PCI_MSIX_IRQ_INDEX) {
                error!("failed to enable msix: {}", e);
                self.enable_intx();
                return;
            }
        } else {
            // Not every vector has an irqfd yet; cannot enable MSI-X.
            self.enable_intx();
            return;
        }

        self.irq_type = Some(VfioIrqType::Msix);
    }

    /// Disables MSI-X and falls back to INTx.
    fn disable_msix(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
            error!("failed to disable msix: {}", e);
            return;
        }

        self.enable_intx();
    }

    /// Mmaps the mmap-able parts of BAR `index` (guest base `bar_addr`) into
    /// the guest and into this process, returning the local mappings that
    /// must be kept alive. Failures abandon the remaining chunks.
    fn add_bar_mmap(&self, index: u32, bar_addr: u64) -> Vec<MemoryMapping> {
        let mut mem_map: Vec<MemoryMapping> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // The bar storing the msix table and pba must not be mmapped:
            // those bars should be trapped so that msix can be emulated.
            if let Some(msix_cap) = &self.msix_cap {
                if msix_cap.is_msix_bar(index) {
                    return mem_map;
                }
            }

            let mmaps = self.device.get_region_mmap(index);
            if mmaps.is_empty() {
                return mem_map;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                let guest_map_start = bar_addr + mmap_offset;
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                // Ask the main VM process to map this range into the guest.
                if self
                    .vm_socket_mem
                    .send(&VmMemoryRequest::RegisterMmapMemory {
                        descriptor,
                        size: mmap_size as usize,
                        offset,
                        gpa: guest_map_start,
                    })
                    .is_err()
                {
                    break;
                }

                let response: VmMemoryResponse = match self.vm_socket_mem.recv() {
                    Ok(res) => res,
                    Err(_) => break,
                };
                match response {
                    VmMemoryResponse::Ok => {
                        // Even if the vm has mapped this region, it is in the vm
                        // main process; the device process doesn't have this
                        // mapping, but vfio_dma_map() needs it in the device
                        // process, so map it again here.
                        let mmap = match MemoryMappingBuilder::new(mmap_size as usize)
                            .from_file(self.device.device_file())
                            .offset(offset)
                            .build()
                        {
                            Ok(v) => v,
                            Err(_e) => break,
                        };
                        let host = (&mmap).as_ptr() as u64;
                        let pgsz = pagesize() as u64;
                        // Round the size up to page granularity.
                        let size = (mmap_size + pgsz - 1) / pgsz * pgsz;
                        // Safe because the given guest_map_start is a valid guest bar address and
                        // the host pointer is correct and valid, guaranteed by the MemoryMapping
                        // interface. The size will be extended to page alignment if it is not
                        // aligned, which is also safe because VFIO actually maps the BAR with a
                        // page size aligned size.
                        match unsafe { self.device.vfio_dma_map(guest_map_start, size, host) } {
                            Ok(_) => mem_map.push(mmap),
                            Err(e) => {
                                error!(
                                    "{}, index: {}, bar_addr:0x{:x}, host:0x{:x}",
                                    e, index, bar_addr, host
                                );
                                break;
                            }
                        }
                    }
                    _ => break,
                }
            }
        }

        mem_map
    }

    /// Maps every known MMIO BAR, stashing the resulting mappings in
    /// `self.mem` so they stay alive for the device's lifetime.
    fn enable_bars_mmap(&mut self) {
        for mmio_info in self.mmio_regions.iter() {
            let mut mem_map = self.add_bar_mmap(mmio_info.bar_index, mmio_info.start);
            self.mem.append(&mut mem_map);
        }
    }
}
760 
761 impl PciDevice for VfioPciDevice {
debug_label(&self) -> String762     fn debug_label(&self) -> String {
763         format!("vfio {} device", self.device.device_name())
764     }
765 
    /// Reserves a guest PCI address for this device, derived from the host
    /// device name so the guest sees the same bus/dev/func. Idempotent:
    /// subsequent calls return the already-assigned address.
    fn allocate_address(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<PciAddress, PciDeviceError> {
        if self.pci_address.is_none() {
            let address = PciAddress::from_string(self.device.device_name());
            if resources.reserve_pci(
                Alloc::PciBar {
                    bus: address.bus,
                    dev: address.dev,
                    func: address.func,
                    bar: 0,
                },
                self.debug_label(),
            ) {
                self.pci_address = Some(address);
            }
        }
        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
    }
786 
keep_rds(&self) -> Vec<RawDescriptor>787     fn keep_rds(&self) -> Vec<RawDescriptor> {
788         let mut rds = self.device.keep_rds();
789         if let Some(ref interrupt_evt) = self.interrupt_evt {
790             rds.push(interrupt_evt.as_raw_descriptor());
791         }
792         if let Some(ref interrupt_resample_evt) = self.interrupt_resample_evt {
793             rds.push(interrupt_resample_evt.as_raw_descriptor());
794         }
795         rds.push(self.vm_socket_mem.as_raw_descriptor());
796         if let Some(msi_cap) = &self.msi_cap {
797             rds.push(msi_cap.vm_socket_irq.as_raw_descriptor());
798         }
799         if let Some(msix_cap) = &self.msix_cap {
800             rds.push(msix_cap.config.as_raw_descriptor());
801         }
802         rds
803     }
804 
assign_irq( &mut self, irq_evt: Event, irq_resample_evt: Event, irq_num: u32, _irq_pin: PciInterruptPin, )805     fn assign_irq(
806         &mut self,
807         irq_evt: Event,
808         irq_resample_evt: Event,
809         irq_num: u32,
810         _irq_pin: PciInterruptPin,
811     ) {
812         self.config.write_config_byte(irq_num as u8, 0x3C);
813         self.interrupt_evt = Some(irq_evt);
814         self.interrupt_resample_evt = Some(irq_resample_evt);
815 
816         // enable INTX
817         if self.config.read_config_byte(PCI_INTERRUPT_PIN) > 0 {
818             self.enable_intx();
819         }
820     }
821 
allocate_io_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<(u64, u64)>, PciDeviceError>822     fn allocate_io_bars(
823         &mut self,
824         resources: &mut SystemAllocator,
825     ) -> Result<Vec<(u64, u64)>, PciDeviceError> {
826         let mut ranges = Vec::new();
827         let mut i = VFIO_PCI_BAR0_REGION_INDEX;
828         let address = self
829             .pci_address
830             .expect("allocate_address must be called prior to allocate_io_bars");
831 
832         while i <= VFIO_PCI_ROM_REGION_INDEX {
833             let mut low: u32 = 0xffffffff;
834             let offset: u32;
835             if i == VFIO_PCI_ROM_REGION_INDEX {
836                 offset = 0x30;
837             } else {
838                 offset = 0x10 + i * 4;
839             }
840             self.config.write_config_dword(low, offset);
841             low = self.config.read_config_dword(offset);
842 
843             let low_flag = low & 0xf;
844             let is_64bit = low_flag & 0x4 == 0x4;
845             if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
846                 let mut upper: u32 = 0xffffffff;
847                 if is_64bit {
848                     self.config.write_config_dword(upper, offset + 4);
849                     upper = self.config.read_config_dword(offset + 4);
850                 }
851 
852                 low &= 0xffff_fff0;
853                 let mut size: u64 = u64::from(upper);
854                 size <<= 32;
855                 size |= u64::from(low);
856                 size = !size + 1;
857                 let mmio_type = match is_64bit {
858                     false => MmioType::Low,
859                     true => MmioType::High,
860                 };
861                 let bar_addr = resources
862                     .mmio_allocator(mmio_type)
863                     .allocate_with_align(
864                         size,
865                         Alloc::PciBar {
866                             bus: address.bus,
867                             dev: address.dev,
868                             func: address.func,
869                             bar: i as u8,
870                         },
871                         "vfio_bar".to_string(),
872                         size,
873                     )
874                     .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
875                 ranges.push((bar_addr, size));
876                 self.mmio_regions.push(MmioInfo {
877                     bar_index: i,
878                     start: bar_addr,
879                     length: size,
880                 });
881 
882                 low = bar_addr as u32;
883                 low |= low_flag;
884                 self.config.write_config_dword(low, offset);
885                 if is_64bit {
886                     upper = (bar_addr >> 32) as u32;
887                     self.config.write_config_dword(upper, offset + 4);
888                 }
889             } else if low_flag & 0x1 == 0x1 {
890                 self.io_regions.push(IoInfo { bar_index: i });
891             }
892 
893             if is_64bit {
894                 i += 2;
895             } else {
896                 i += 1;
897             }
898         }
899 
900         // Quirk, enable igd memory for guest vga arbitrate, otherwise kernel vga arbitrate
901         // driver doesn't claim this vga device, then xorg couldn't boot up.
902         if self.is_intel_gfx() {
903             let mut cmd = self.config.read_config_byte(PCI_COMMAND);
904             cmd |= PCI_COMMAND_MEMORY;
905             self.config.write_config_byte(cmd, PCI_COMMAND);
906         }
907 
908         Ok(ranges)
909     }
910 
allocate_device_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<(u64, u64)>, PciDeviceError>911     fn allocate_device_bars(
912         &mut self,
913         resources: &mut SystemAllocator,
914     ) -> Result<Vec<(u64, u64)>, PciDeviceError> {
915         let mut ranges = Vec::new();
916 
917         if !self.is_intel_gfx() {
918             return Ok(ranges);
919         }
920 
921         // Make intel gfx's opregion as mmio bar, and allocate a gpa for it
922         // then write this gpa into pci cfg register
923         if let Some((index, size)) = self.device.get_cap_type_info(
924             VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (INTEL_VENDOR_ID as u32),
925             VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
926         ) {
927             let address = self
928                 .pci_address
929                 .expect("allocate_address must be called prior to allocate_device_bars");
930             let bar_addr = resources
931                 .mmio_allocator(MmioType::Low)
932                 .allocate(
933                     size,
934                     Alloc::PciBar {
935                         bus: address.bus,
936                         dev: address.dev,
937                         func: address.func,
938                         bar: (index * 4) as u8,
939                     },
940                     "vfio_bar".to_string(),
941                 )
942                 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
943             ranges.push((bar_addr, size));
944             self.device_data = Some(DeviceData::IntelGfxData {
945                 opregion_index: index,
946             });
947 
948             self.mmio_regions.push(MmioInfo {
949                 bar_index: index,
950                 start: bar_addr,
951                 length: size,
952             });
953             self.config.write_config_dword(bar_addr as u32, 0xFC);
954         }
955 
956         Ok(ranges)
957     }
958 
register_device_capabilities(&mut self) -> Result<(), PciDeviceError>959     fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
960         Ok(())
961     }
962 
ioevents(&self) -> Vec<(&Event, u64, Datamatch)>963     fn ioevents(&self) -> Vec<(&Event, u64, Datamatch)> {
964         Vec::new()
965     }
966 
read_config_register(&self, reg_idx: usize) -> u32967     fn read_config_register(&self, reg_idx: usize) -> u32 {
968         let reg: u32 = (reg_idx * 4) as u32;
969 
970         let mut config = self.config.read_config_dword(reg);
971 
972         // Ignore IO bar
973         if (0x10..=0x24).contains(&reg) {
974             for io_info in self.io_regions.iter() {
975                 if io_info.bar_index * 4 + 0x10 == reg {
976                     config = 0;
977                 }
978             }
979         } else if let Some(msix_cap) = &self.msix_cap {
980             if msix_cap.is_msix_control_reg(reg, 4) {
981                 msix_cap.read_msix_control(&mut config);
982             }
983         }
984 
985         // Quirk for intel graphic, set stolen memory size to 0 in pci_cfg[0x51]
986         if self.is_intel_gfx() && reg == 0x50 {
987             config &= 0xffff00ff;
988         }
989 
990         config
991     }
992 
write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8])993     fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
994         let start = (reg_idx * 4) as u64 + offset;
995 
996         let mut msi_change: Option<VfioMsiChange> = None;
997         if let Some(msi_cap) = self.msi_cap.as_mut() {
998             if msi_cap.is_msi_reg(start, data.len()) {
999                 msi_change = msi_cap.write_msi_reg(start, data);
1000             }
1001         }
1002 
1003         match msi_change {
1004             Some(VfioMsiChange::Enable) => self.enable_msi(),
1005             Some(VfioMsiChange::Disable) => self.disable_msi(),
1006             None => (),
1007         }
1008 
1009         msi_change = None;
1010         if let Some(msix_cap) = self.msix_cap.as_mut() {
1011             if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
1012                 msi_change = msix_cap.write_msix_control(data);
1013             }
1014         }
1015         match msi_change {
1016             Some(VfioMsiChange::Enable) => self.enable_msix(),
1017             Some(VfioMsiChange::Disable) => self.disable_msix(),
1018             None => (),
1019         }
1020 
1021         // if guest enable memory access, then enable bar mappable once
1022         if start == PCI_COMMAND as u64
1023             && data.len() == 2
1024             && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
1025             && self.mem.is_empty()
1026         {
1027             self.enable_bars_mmap();
1028         }
1029 
1030         self.device
1031             .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, start);
1032     }
1033 
read_bar(&mut self, addr: u64, data: &mut [u8])1034     fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
1035         if let Some(mmio_info) = self.find_region(addr) {
1036             let offset = addr - mmio_info.start;
1037             let bar_index = mmio_info.bar_index;
1038             if let Some(msix_cap) = &self.msix_cap {
1039                 if msix_cap.is_msix_table(bar_index, offset) {
1040                     msix_cap.read_table(offset, data);
1041                     return;
1042                 } else if msix_cap.is_msix_pba(bar_index, offset) {
1043                     msix_cap.read_pba(offset, data);
1044                     return;
1045                 }
1046             }
1047             self.device.region_read(bar_index, data, offset);
1048         }
1049     }
1050 
write_bar(&mut self, addr: u64, data: &[u8])1051     fn write_bar(&mut self, addr: u64, data: &[u8]) {
1052         if let Some(mmio_info) = self.find_region(addr) {
1053             // Ignore igd opregion's write
1054             if let Some(device_data) = &self.device_data {
1055                 match *device_data {
1056                     DeviceData::IntelGfxData { opregion_index } => {
1057                         if opregion_index == mmio_info.bar_index {
1058                             return;
1059                         }
1060                     }
1061                 }
1062             }
1063 
1064             let offset = addr - mmio_info.start;
1065             let bar_index = mmio_info.bar_index;
1066 
1067             if let Some(msix_cap) = self.msix_cap.as_mut() {
1068                 if msix_cap.is_msix_table(bar_index, offset) {
1069                     msix_cap.write_table(offset, data);
1070                     return;
1071                 } else if msix_cap.is_msix_pba(bar_index, offset) {
1072                     msix_cap.write_pba(offset, data);
1073                     return;
1074                 }
1075             }
1076 
1077             self.device.region_write(bar_index, data, offset);
1078         }
1079     }
1080 }
1081