• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::cell::RefCell;
6 use std::collections::HashMap;
7 use std::ffi::CString;
8 use std::fs::File;
9 use std::fs::OpenOptions;
10 use std::io;
11 use std::mem;
12 use std::os::raw::c_ulong;
13 use std::os::unix::prelude::FileExt;
14 use std::path::Path;
15 use std::path::PathBuf;
16 use std::slice;
17 use std::sync::Arc;
18 use std::u32;
19 
20 use base::error;
21 use base::ioctl;
22 use base::ioctl_with_mut_ptr;
23 use base::ioctl_with_mut_ref;
24 use base::ioctl_with_ptr;
25 use base::ioctl_with_ref;
26 use base::ioctl_with_val;
27 use base::warn;
28 use base::AsRawDescriptor;
29 use base::Error;
30 use base::Event;
31 use base::FromRawDescriptor;
32 use base::RawDescriptor;
33 use base::SafeDescriptor;
34 use data_model::vec_with_array_field;
35 use data_model::zerocopy_from_reader;
36 use hypervisor::DeviceKind;
37 use hypervisor::Vm;
38 use once_cell::sync::OnceCell;
39 use remain::sorted;
40 use resources::address_allocator::AddressAllocator;
41 use resources::AddressRange;
42 use resources::Alloc;
43 use resources::Error as ResourcesError;
44 use sync::Mutex;
45 use thiserror::Error;
46 use vfio_sys::*;
47 use vm_memory::MemoryRegionInformation;
48 use zerocopy::AsBytes;
49 use zerocopy::FromBytes;
50 
51 use crate::IommuDevType;
52 
53 #[sorted]
54 #[derive(Error, Debug)]
55 pub enum VfioError {
56     #[error("failed to borrow global vfio container")]
57     BorrowVfioContainer,
58     #[error("failed to duplicate VfioContainer")]
59     ContainerDupError,
60     #[error("failed to set container's IOMMU driver type as VfioType1V2: {0}")]
61     ContainerSetIOMMU(Error),
62     #[error("failed to create KVM vfio device: {0}")]
63     CreateVfioKvmDevice(Error),
64     #[error("failed to get Group Status: {0}")]
65     GetGroupStatus(Error),
66     #[error("failed to get vfio device fd: {0}")]
67     GroupGetDeviceFD(Error),
68     #[error("failed to add vfio group into vfio container: {0}")]
69     GroupSetContainer(Error),
70     #[error("group is inviable")]
71     GroupViable,
72     #[error("invalid region index: {0}")]
73     InvalidIndex(u32),
74     #[error("invalid file path")]
75     InvalidPath,
76     #[error("failed to add guest memory map into iommu table: {0}")]
77     IommuDmaMap(Error),
78     #[error("failed to remove guest memory map from iommu table: {0}")]
79     IommuDmaUnmap(Error),
80     #[error("failed to get IOMMU cap info from host")]
81     IommuGetCapInfo,
82     #[error("failed to get IOMMU info from host: {0}")]
83     IommuGetInfo(Error),
84     #[error("failed to set KVM vfio device's attribute: {0}")]
85     KvmSetDeviceAttr(Error),
86     #[error("AddressAllocator is unavailable")]
87     NoRescAlloc,
88     #[error("failed to open /dev/vfio/vfio container: {0}")]
89     OpenContainer(io::Error),
90     #[error("failed to open {1} group: {0}")]
91     OpenGroup(io::Error, String),
92     #[error("resources error: {0}")]
93     Resources(ResourcesError),
94     #[error("unknown vfio device type (flags: {0:#x})")]
95     UnknownDeviceType(u32),
96     #[error(
97         "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
98     )]
99     VfioApiVersion,
100     #[error("failed to get vfio device's info or info doesn't match: {0}")]
101     VfioDeviceGetInfo(Error),
102     #[error("failed to get vfio device's region info: {0}")]
103     VfioDeviceGetRegionInfo(Error),
104     #[error("failed to disable vfio deviece's irq: {0}")]
105     VfioIrqDisable(Error),
106     #[error("failed to enable vfio deviece's irq: {0}")]
107     VfioIrqEnable(Error),
108     #[error("failed to mask vfio deviece's irq: {0}")]
109     VfioIrqMask(Error),
110     #[error("failed to unmask vfio deviece's irq: {0}")]
111     VfioIrqUnmask(Error),
112     #[error("failed to enter vfio deviece's low power state: {0}")]
113     VfioPmLowPowerEnter(Error),
114     #[error("failed to exit vfio deviece's low power state: {0}")]
115     VfioPmLowPowerExit(Error),
116     #[error("container dones't support VfioType1V2 IOMMU driver type")]
117     VfioType1V2,
118 }
119 
/// Module-local result alias; all fallible VFIO operations return `VfioError`.
type Result<T> = std::result::Result<T, VfioError>;

// Captures the errno-derived error for the most recently failed syscall/ioctl.
fn get_error() -> Error {
    Error::last()
}

// Process-wide KVM VFIO pseudo-device file, created lazily the first time a
// group is registered with KVM (see `get_group_with_vm`) and shared afterwards.
static KVM_VFIO_FILE: OnceCell<SafeDescriptor> = OnceCell::new();
127 
/// Kind of device exposed through VFIO, as reported by the kernel in
/// `vfio_device_info.flags` (PCI endpoint vs. platform device).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VfioDeviceType {
    Pci,
    Platform,
}
133 
// Operation to perform on the KVM VFIO pseudo-device: register or unregister
// a VFIO group (KVM_DEV_VFIO_GROUP_ADD / KVM_DEV_VFIO_GROUP_DEL).
enum KvmVfioGroupOps {
    Add,
    Delete,
}
138 
// IOMMU driver types that can be selected for a container via VFIO_SET_IOMMU.
// `#[repr(u32)]` so variants can be passed directly as the ioctl argument.
#[repr(u32)]
enum IommuType {
    Type1V2 = VFIO_TYPE1v2_IOMMU,
    // ChromeOS specific vfio_iommu_type1 implementation that is optimized for
    // small, dynamic mappings. For clients which create large, relatively
    // static mappings, Type1V2 is still preferred.
    //
    // See crrev.com/c/3593528 for the implementation.
    Type1ChromeOS = 100001,
}
149 
// Hint as to whether IOMMU mappings will tend to be large and static or
// small and dynamic. Used by `init_vfio_iommu` to choose the IOMMU driver.
#[derive(PartialEq, Eq)]
enum IommuMappingHint {
    Static,
    Dynamic,
}
157 
/// VfioContainer contains multiple VfioGroups and delegates one IOMMU domain table.
pub struct VfioContainer {
    // Handle to the `/dev/vfio/vfio` container device.
    container: File,
    // Groups added to this container, keyed by IOMMU group id.
    groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
}
163 
// Deserializes a `T` from `bytes` starting at `offset`.
//
// Panics ("malformed kernel data") if the slice is too short for `T`; callers
// pass buffers whose layout/size the kernel itself reported, so a short read
// indicates corrupted kernel output rather than a recoverable error.
fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> T
where
    T: FromBytes,
{
    zerocopy_from_reader(&bytes[offset..(offset + mem::size_of::<T>())])
        .expect("malformed kernel data")
}
171 
// Expected result of VFIO_GET_API_VERSION; anything else means the kernel
// speaks a VFIO API we were not built against.
const VFIO_API_VERSION: u8 = 0;
impl VfioContainer {
    /// Opens `/dev/vfio/vfio` and wraps it in a new container.
    pub fn new() -> Result<Self> {
        let container = OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/vfio/vfio")
            .map_err(VfioError::OpenContainer)?;

        Self::new_from_container(container)
    }

    // Construct a VfioContainer from an existing container file.
    pub fn new_from_container(container: File) -> Result<Self> {
        // Safe as file is vfio container descriptor and ioctl is defined by kernel.
        let version = unsafe { ioctl(&container, VFIO_GET_API_VERSION()) };
        if version as u8 != VFIO_API_VERSION {
            return Err(VfioError::VfioApiVersion);
        }

        Ok(VfioContainer {
            container,
            groups: HashMap::new(),
        })
    }

    // Returns true if the group with `group_id` has already been added here.
    fn is_group_set(&self, group_id: u32) -> bool {
        self.groups.get(&group_id).is_some()
    }

    // Asks the kernel whether it supports the given IOMMU driver type;
    // VFIO_CHECK_EXTENSION returns 1 when the extension is present.
    fn check_extension(&self, val: IommuType) -> bool {
        // Safe as file is vfio container and make sure val is valid.
        let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION(), val as c_ulong) };
        ret == 1
    }

    // Selects the IOMMU driver for this container. Returns the raw ioctl
    // result: 0 on success, negative on failure.
    fn set_iommu(&self, val: IommuType) -> i32 {
        // Safe as file is vfio container and make sure val is valid.
        unsafe { ioctl_with_val(self, VFIO_SET_IOMMU(), val as c_ulong) }
    }

    /// Maps `size` bytes of this process's memory at `user_addr` into the
    /// container's IOVA space at `iova`. Read access is always granted;
    /// write access only when `write_en` is set.
    ///
    /// # Safety
    ///
    /// Caller must ensure `[user_addr, user_addr + size)` is valid process
    /// memory that stays alive for the lifetime of the mapping and does not
    /// overlap other DMA mappings (see the call site in `get_group_with_vm`).
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        let mut dma_map = vfio_iommu_type1_dma_map {
            argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
            flags: VFIO_DMA_MAP_FLAG_READ,
            vaddr: user_addr,
            iova,
            size,
        };

        if write_en {
            dma_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
        }

        let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA(), &dma_map);
        if ret != 0 {
            return Err(VfioError::IommuDmaMap(get_error()));
        }

        Ok(())
    }

    /// Removes the DMA mapping previously established over `[iova, iova + size)`.
    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        let mut dma_unmap = vfio_iommu_type1_dma_unmap {
            argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
            flags: 0,
            iova,
            size,
            ..Default::default()
        };

        // Safe as file is vfio container, dma_unmap is constructed by us, and
        // we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA(), &mut dma_unmap) };
        // The kernel writes back the number of bytes actually unmapped; a
        // partial unmap is treated as a failure as well.
        if ret != 0 || dma_unmap.size != size {
            return Err(VfioError::IommuDmaUnmap(get_error()));
        }

        Ok(())
    }

    /// Queries the bitmask of supported IOVA page sizes from the container's IOMMU.
    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
        let mut iommu_info = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // Safe as file is vfio container, iommu_info has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO(), &mut iommu_info) };
        if ret != 0 || (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES) == 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        Ok(iommu_info.iova_pgsizes)
    }

    /// Returns the valid IOVA ranges advertised by the IOMMU through the
    /// VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability chain.
    pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        // Query the buffer size needed to fetch the capabilities.
        let mut iommu_info_argsz = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // Safe as file is vfio container, iommu_info_argsz has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO(), &mut iommu_info_argsz) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Re-issue the query with a buffer big enough (argsz reported above)
        // to hold the whole capability chain after the fixed-size header.
        let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
            iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
        );
        iommu_info[0].argsz = iommu_info_argsz.argsz;
        // Safe as file is vfio container, iommu_info has valid values,
        // and we check the return value
        let ret =
            unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO(), iommu_info.as_mut_ptr()) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        // Safe because we initialized iommu_info with enough space, u8 has less strict
        // alignment, and since it will no longer be mutated.
        let info_bytes = unsafe {
            std::slice::from_raw_parts(
                iommu_info.as_ptr() as *const u8,
                iommu_info_argsz.argsz as usize,
            )
        };

        if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Walk the capability chain (linked by byte offsets into the buffer)
        // looking for the IOVA-range capability.
        let mut offset = iommu_info[0].cap_offset as usize;
        while offset != 0 {
            let header = extract_vfio_struct::<vfio_info_cap_header>(info_bytes, offset);

            if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 {
                let iova_header = extract_vfio_struct::<vfio_iommu_type1_info_cap_iova_range_header>(
                    info_bytes, offset,
                );
                let range_offset = offset + mem::size_of::<vfio_iommu_type1_info_cap_iova_range>();
                let mut ret = Vec::new();
                for i in 0..iova_header.nr_iovas {
                    ret.push(extract_vfio_struct::<vfio_iova_range>(
                        info_bytes,
                        range_offset + i as usize * mem::size_of::<vfio_iova_range>(),
                    ));
                }
                return Ok(ret
                    .iter()
                    .map(|range| AddressRange {
                        start: range.start,
                        end: range.end,
                    })
                    .collect());
            }
            offset = header.next as usize;
        }

        // Capability chain exhausted without finding an IOVA-range entry.
        Err(VfioError::IommuGetCapInfo)
    }

    // Picks and installs an IOMMU driver for this container based on the
    // expected mapping pattern. Must run once, before the first group is used.
    fn init_vfio_iommu(&mut self, hint: IommuMappingHint) -> Result<()> {
        // If we expect granular, dynamic mappings (i.e. viommu/coiommu), try the
        // ChromeOS Type1ChromeOS first, then fall back to upstream versions.
        if hint == IommuMappingHint::Dynamic {
            if self.set_iommu(IommuType::Type1ChromeOS) == 0 {
                return Ok(());
            }
        }

        if !self.check_extension(IommuType::Type1V2) {
            return Err(VfioError::VfioType1V2);
        }

        if self.set_iommu(IommuType::Type1V2) < 0 {
            return Err(VfioError::ContainerSetIOMMU(get_error()));
        }

        Ok(())
    }

    // Returns the group `id`, creating it (and, for the first group, doing the
    // one-time container/IOMMU setup and KVM registration) if necessary.
    fn get_group_with_vm(
        &mut self,
        id: u32,
        vm: &impl Vm,
        iommu_enabled: bool,
    ) -> Result<Arc<Mutex<VfioGroup>>> {
        match self.groups.get(&id) {
            Some(group) => Ok(group.clone()),
            None => {
                let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
                if self.groups.is_empty() {
                    // Before the first group is added into container, do once per container
                    // initialization. Both coiommu and virtio-iommu rely on small, dynamic
                    // mappings. However, if an iommu is not enabled, then we map the entirety
                    // of guest memory as a small number of large, static mappings.
                    let mapping_hint = if iommu_enabled {
                        IommuMappingHint::Dynamic
                    } else {
                        IommuMappingHint::Static
                    };
                    self.init_vfio_iommu(mapping_hint)?;

                    if !iommu_enabled {
                        vm.get_memory().with_regions(
                            |MemoryRegionInformation {
                                 guest_addr,
                                 size,
                                 host_addr,
                                 ..
                             }| {
                                // Safe because the guest regions are guaranteed not to overlap
                                unsafe {
                                    self.vfio_dma_map(
                                        guest_addr.0,
                                        size as u64,
                                        host_addr as u64,
                                        true,
                                    )
                                }
                            },
                        )?;
                    }
                }

                // Register the group with KVM's VFIO pseudo-device (created
                // lazily, once per process).
                let kvm_vfio_file = KVM_VFIO_FILE
                    .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
                    .map_err(VfioError::CreateVfioKvmDevice)?;
                group
                    .lock()
                    .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Add)?;

                self.groups.insert(id, group.clone());

                Ok(group)
            }
        }
    }

    // Like `get_group_with_vm` but without a VM: no guest-memory mapping and
    // no KVM registration; always initializes with the Static hint.
    fn get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>> {
        match self.groups.get(&id) {
            Some(group) => Ok(group.clone()),
            None => {
                let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));

                if self.groups.is_empty() {
                    // Before the first group is added into container, do once per
                    // container initialization.
                    self.init_vfio_iommu(IommuMappingHint::Static)?;
                }

                self.groups.insert(id, group.clone());
                Ok(group)
            }
        }
    }

    // Drops group `id` once its device count reaches zero, unregistering it
    // from KVM first. `reduce` decrements the group's device count beforehand.
    fn remove_group(&mut self, id: u32, reduce: bool) {
        let mut remove = false;

        if let Some(group) = self.groups.get(&id) {
            if reduce {
                group.lock().reduce_device_num();
            }
            if group.lock().device_num() == 0 {
                // KVM_VFIO_FILE must exist: it was created when the group was added.
                let kvm_vfio_file = KVM_VFIO_FILE.get().expect("kvm vfio file isn't created");
                if group
                    .lock()
                    .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Delete)
                    .is_err()
                {
                    warn!("failing in remove vfio group from kvm device");
                }
                remove = true;
            }
        }

        if remove {
            self.groups.remove(&id);
        }
    }

    /// Duplicates the container descriptor; the caller owns the returned fd.
    pub fn clone_as_raw_descriptor(&self) -> Result<RawDescriptor> {
        // Safe because self.container owns a valid descriptor and we check the
        // return value of dup(2).
        let raw_descriptor = unsafe { libc::dup(self.container.as_raw_descriptor()) };
        if raw_descriptor < 0 {
            Err(VfioError::ContainerDupError)
        } else {
            Ok(raw_descriptor)
        }
    }

    // Gets group ids for all groups in the container.
    pub fn group_ids(&self) -> Vec<&u32> {
        self.groups.keys().collect()
    }
}
487 
// Expose the underlying container fd so the ioctl helpers can target it.
impl AsRawDescriptor for VfioContainer {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.container.as_raw_descriptor()
    }
}
493 
// A single VFIO group (`/dev/vfio/<id>`) attached to a container, plus a
// count of the devices currently opened through it.
struct VfioGroup {
    group: File,
    device_num: u32,
}
498 
impl VfioGroup {
    // Opens `/dev/vfio/<id>`, verifies the group is viable, and attaches it
    // to `container` via VFIO_GROUP_SET_CONTAINER.
    fn new(container: &VfioContainer, id: u32) -> Result<Self> {
        let group_path = format!("/dev/vfio/{}", id);
        let group_file = OpenOptions::new()
            .read(true)
            .write(true)
            .open(Path::new(&group_path))
            .map_err(|e| VfioError::OpenGroup(e, group_path))?;

        let mut group_status = vfio_group_status {
            argsz: mem::size_of::<vfio_group_status>() as u32,
            flags: 0,
        };
        // Safe as we are the owner of group_file and group_status which are valid value.
        let mut ret =
            unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS(), &mut group_status) };
        if ret < 0 {
            return Err(VfioError::GetGroupStatus(get_error()));
        }

        // A group is only usable when all its devices are bound to vfio;
        // the kernel reports this via the VIABLE flag.
        if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
            return Err(VfioError::GroupViable);
        }

        // Safe as we are the owner of group_file and container_raw_descriptor which are valid value,
        // and we verify the ret value
        let container_raw_descriptor = container.as_raw_descriptor();
        ret = unsafe {
            ioctl_with_ref(
                &group_file,
                VFIO_GROUP_SET_CONTAINER(),
                &container_raw_descriptor,
            )
        };
        if ret < 0 {
            return Err(VfioError::GroupSetContainer(get_error()));
        }

        Ok(VfioGroup {
            group: group_file,
            device_num: 0,
        })
    }

    // Resolves a device's IOMMU group id by reading the `iommu_group` symlink
    // under its sysfs path (the link target's basename is the numeric id).
    fn get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32> {
        let mut uuid_path = PathBuf::new();
        uuid_path.push(sysfspath);
        uuid_path.push("iommu_group");
        let group_path = uuid_path.read_link().map_err(|_| VfioError::InvalidPath)?;
        let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
        let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let group_id = group_str
            .parse::<u32>()
            .map_err(|_| VfioError::InvalidPath)?;

        Ok(group_id)
    }

    // Adds or removes this group on the KVM VFIO pseudo-device via
    // KVM_SET_DEVICE_ATTR. The kernel reads the group fd through `addr`,
    // which points at a local copy of our raw descriptor.
    fn kvm_device_set_group(
        &self,
        kvm_vfio_file: &SafeDescriptor,
        ops: KvmVfioGroupOps,
    ) -> Result<()> {
        let group_descriptor = self.as_raw_descriptor();
        let group_descriptor_ptr = &group_descriptor as *const i32;
        let vfio_dev_attr = match ops {
            KvmVfioGroupOps::Add => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
                addr: group_descriptor_ptr as u64,
            },
            KvmVfioGroupOps::Delete => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_DEL as u64,
                addr: group_descriptor_ptr as u64,
            },
        };

        // Safe as we are the owner of vfio_dev_descriptor and vfio_dev_attr which are valid value,
        // and we verify the return value.
        if 0 != unsafe {
            ioctl_with_ref(
                kvm_vfio_file,
                kvm_sys::KVM_SET_DEVICE_ATTR(),
                &vfio_dev_attr,
            )
        } {
            return Err(VfioError::KvmSetDeviceAttr(get_error()));
        }

        Ok(())
    }

    // Opens the device named `name` (e.g. "0000:02:00.0") within this group
    // via VFIO_GROUP_GET_DEVICE_FD, returning the new device file.
    fn get_device(&self, name: &str) -> Result<File> {
        let path: CString = CString::new(name.as_bytes()).expect("CString::new() failed");
        let path_ptr = path.as_ptr();

        // Safe as we are the owner of self and path_ptr which are valid value.
        let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD(), path_ptr) };
        if ret < 0 {
            return Err(VfioError::GroupGetDeviceFD(get_error()));
        }

        // Safe as ret is valid descriptor
        Ok(unsafe { File::from_raw_descriptor(ret) })
    }

    // Increments the count of devices opened through this group.
    fn add_device_num(&mut self) {
        self.device_num += 1;
    }

    // Decrements the device count; callers must not call this more times
    // than `add_device_num` (u32 underflow would panic in debug builds).
    fn reduce_device_num(&mut self) {
        self.device_num -= 1;
    }

    // Current number of devices opened through this group.
    fn device_num(&self) -> u32 {
        self.device_num
    }
}
620 
// Expose the underlying group fd for ioctls and KVM registration.
impl AsRawDescriptor for VfioGroup {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.group.as_raw_descriptor()
    }
}
626 
/// A helper trait for managing VFIO setup
pub trait VfioCommonTrait: Send + Sync {
    /// The single place to create a VFIO container for a PCI endpoint.
    ///
    /// The policy to determine whether an individual or a shared VFIO container
    /// will be created for this device is governed by the physical PCI topology,
    /// and the argument iommu_enabled.
    ///
    /// # Arguments
    ///
    /// * `sysfspath` - the path to the PCI device, e.g. /sys/bus/pci/devices/0000:02:00.0
    /// * `iommu_dev` - the kind of IOMMU (none, virtio-iommu, CoIOMMU) in front of this device
    fn vfio_get_container<P: AsRef<Path>>(
        iommu_dev: IommuDevType,
        sysfspath: Option<P>,
    ) -> Result<Arc<Mutex<VfioContainer>>>;
}
644 
// Thread-local caches so repeated vfio_get_container calls on the same thread
// reuse containers instead of opening /dev/vfio/vfio again.
thread_local! {

    // One VFIO container is shared by all VFIO devices that don't
    // attach to the virtio IOMMU device
    static NO_IOMMU_CONTAINER: RefCell<Option<Arc<Mutex<VfioContainer>>>> = RefCell::new(None);

    // For IOMMU enabled devices, all VFIO groups that share the same IOVA space
    // are managed by one VFIO container
    static IOMMU_CONTAINERS: RefCell<Option<Vec<Arc<Mutex<VfioContainer>>>>> = RefCell::new(Some(Default::default()));

    // One VFIO container is shared by all VFIO devices that
    // attach to the CoIOMMU device
    static COIOMMU_CONTAINER: RefCell<Option<Arc<Mutex<VfioContainer>>>> = RefCell::new(None);
}
659 
660 pub struct VfioCommonSetup;
661 
662 impl VfioCommonTrait for VfioCommonSetup {
vfio_get_container<P: AsRef<Path>>( iommu_dev: IommuDevType, sysfspath: Option<P>, ) -> Result<Arc<Mutex<VfioContainer>>>663     fn vfio_get_container<P: AsRef<Path>>(
664         iommu_dev: IommuDevType,
665         sysfspath: Option<P>,
666     ) -> Result<Arc<Mutex<VfioContainer>>> {
667         match iommu_dev {
668             IommuDevType::NoIommu => {
669                 // One VFIO container is used for all IOMMU disabled groups
670                 NO_IOMMU_CONTAINER.with(|v| {
671                     if v.borrow().is_some() {
672                         if let Some(ref container) = *v.borrow() {
673                             Ok(container.clone())
674                         } else {
675                             Err(VfioError::BorrowVfioContainer)
676                         }
677                     } else {
678                         let container = Arc::new(Mutex::new(VfioContainer::new()?));
679                         *v.borrow_mut() = Some(container.clone());
680                         Ok(container)
681                     }
682                 })
683             }
684             IommuDevType::VirtioIommu => {
685                 let path = sysfspath.ok_or(VfioError::InvalidPath)?;
686                 let group_id = VfioGroup::get_group_id(path)?;
687 
688                 // One VFIO container is used for all devices belong to one VFIO group
689                 // NOTE: vfio_wrapper relies on each container containing exactly one group.
690                 IOMMU_CONTAINERS.with(|v| {
691                     if let Some(ref mut containers) = *v.borrow_mut() {
692                         let container = containers
693                             .iter()
694                             .find(|container| container.lock().is_group_set(group_id));
695 
696                         match container {
697                             None => {
698                                 let container = Arc::new(Mutex::new(VfioContainer::new()?));
699                                 containers.push(container.clone());
700                                 Ok(container)
701                             }
702                             Some(container) => Ok(container.clone()),
703                         }
704                     } else {
705                         Err(VfioError::BorrowVfioContainer)
706                     }
707                 })
708             }
709             IommuDevType::CoIommu => {
710                 // One VFIO container is used for devices attached to CoIommu
711                 COIOMMU_CONTAINER.with(|v| {
712                     if v.borrow().is_some() {
713                         if let Some(ref container) = *v.borrow() {
714                             Ok(container.clone())
715                         } else {
716                             Err(VfioError::BorrowVfioContainer)
717                         }
718                     } else {
719                         let container = Arc::new(Mutex::new(VfioContainer::new()?));
720                         *v.borrow_mut() = Some(container.clone());
721                         Ok(container)
722                     }
723                 })
724             }
725         }
726     }
727 }
728 
/// Vfio Irq type used to enable/disable/mask/unmask vfio irq
pub enum VfioIrqType {
    Intx,
    Msi,
    Msix,
}
735 
/// Vfio Irq information used to assign and enable/disable/mask/unmask vfio irq
pub struct VfioIrq {
    /// Kernel-reported VFIO_IRQ_INFO_* flags for this interrupt.
    pub flags: u32,
    /// IRQ index within the device's interrupt table.
    pub index: u32,
}
741 
/// Address on VFIO memory region.
#[derive(Debug, Default, Clone)]
pub struct VfioRegionAddr {
    /// region number.
    pub index: u32,
    /// offset in the region.
    pub addr: u64,
}
750 
/// Metadata for one region of a VFIO device, as reported by the kernel.
#[derive(Debug)]
pub struct VfioRegion {
    // flags for this region: read/write/mmap
    flags: u32,
    // region size in bytes
    size: u64,
    // region offset used to read/write with vfio device descriptor
    offset: u64,
    // vectors for mmap offset and size
    mmaps: Vec<vfio_region_sparse_mmap_area>,
    // type and subtype for cap type
    cap_info: Option<(u32, u32)>,
}
763 
/// Vfio device for exposing regions which could be read/write to kernel vfio device.
pub struct VfioDevice {
    // device fd returned by VFIO_GROUP_GET_DEVICE_FD
    dev: File,
    // device name, e.g. the PCI address "0000:02:00.0"
    name: String,
    // container this device's group is attached to
    container: Arc<Mutex<VfioContainer>>,
    // PCI or platform device, from the kernel device-info flags
    dev_type: VfioDeviceType,
    // raw fd of the owning VfioGroup
    group_descriptor: RawDescriptor,
    // IOMMU group id of the device
    group_id: u32,
    // vec for vfio device's regions
    regions: Vec<VfioRegion>,
    // number of interrupts reported by the kernel
    num_irqs: u32,

    // allocator over the IOMMU's valid IOVA ranges
    iova_alloc: Arc<Mutex<AddressAllocator>>,
}
778 
779 impl VfioDevice {
    /// Create a new vfio device, then guest read/write on this device could be
    /// transferred into kernel vfio.
    /// sysfspath specify the vfio device path in sys file system.
    ///
    /// Also registers the device's group with the VM's KVM VFIO device and,
    /// when no IOMMU is enabled, maps all guest memory for DMA (see
    /// `VfioContainer::get_group_with_vm`).
    pub fn new_passthrough<P: AsRef<Path>>(
        sysfspath: &P,
        vm: &impl Vm,
        container: Arc<Mutex<VfioContainer>>,
        iommu_enabled: bool,
    ) -> Result<Self> {
        // Resolve the IOMMU group from sysfs and fetch/create it in the container.
        let group_id = VfioGroup::get_group_id(sysfspath)?;

        let group = container
            .lock()
            .get_group_with_vm(group_id, vm, iommu_enabled)?;
        // The device name is the basename of the sysfs path (e.g. "0000:02:00.0").
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);
        let dev = group.lock().get_device(&name)?;
        let (dev_info, dev_type) = Self::get_device_info(&dev)?;
        let regions = Self::get_regions(&dev, dev_info.num_regions)?;
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        // Build an IOVA allocator over the ranges the IOMMU actually supports.
        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
        })
    }
822 
new<P: AsRef<Path>>( sysfspath: &P, container: Arc<Mutex<VfioContainer>>, ) -> Result<Self>823     pub fn new<P: AsRef<Path>>(
824         sysfspath: &P,
825         container: Arc<Mutex<VfioContainer>>,
826     ) -> Result<Self> {
827         let group_id = VfioGroup::get_group_id(sysfspath)?;
828         let group = container.lock().get_group(group_id)?;
829         let name_osstr = sysfspath
830             .as_ref()
831             .file_name()
832             .ok_or(VfioError::InvalidPath)?;
833         let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
834         let name = String::from(name_str);
835 
836         let dev = match group.lock().get_device(&name) {
837             Ok(dev) => dev,
838             Err(e) => {
839                 container.lock().remove_group(group_id, false);
840                 return Err(e);
841             }
842         };
843         let (dev_info, dev_type) = match Self::get_device_info(&dev) {
844             Ok(dev_info) => dev_info,
845             Err(e) => {
846                 container.lock().remove_group(group_id, false);
847                 return Err(e);
848             }
849         };
850         let regions = match Self::get_regions(&dev, dev_info.num_regions) {
851             Ok(regions) => regions,
852             Err(e) => {
853                 container.lock().remove_group(group_id, false);
854                 return Err(e);
855             }
856         };
857         group.lock().add_device_num();
858         let group_descriptor = group.lock().as_raw_descriptor();
859 
860         let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
861         let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
862             .map_err(VfioError::Resources)?;
863 
864         Ok(VfioDevice {
865             dev,
866             name,
867             container,
868             dev_type,
869             group_descriptor,
870             group_id,
871             regions,
872             num_irqs: dev_info.num_irqs,
873             iova_alloc: Arc::new(Mutex::new(iova_alloc)),
874         })
875     }
876 
    /// Returns the file for this device.
    // NOTE(review): `device_file()` below returns the same reference — confirm
    // whether both accessors are still needed.
    pub fn dev_file(&self) -> &File {
        &self.dev
    }
881 
    /// Returns PCI device name, formatted as BUS:DEVICE.FUNCTION string.
    pub fn device_name(&self) -> &String {
        &self.name
    }
886 
    /// Returns the type of this VFIO device (PCI or platform).
    pub fn device_type(&self) -> VfioDeviceType {
        self.dev_type
    }
891 
892     /// enter the device's low power state
pm_low_power_enter(&self) -> Result<()>893     pub fn pm_low_power_enter(&self) -> Result<()> {
894         let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
895         device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
896         device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY;
897         // Safe as we are the owner of self and power_management which are valid value
898         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
899         if ret < 0 {
900             Err(VfioError::VfioPmLowPowerEnter(get_error()))
901         } else {
902             Ok(())
903         }
904     }
905 
    /// enter the device's low power state with wakeup notification
    ///
    /// `wakeup_evt` is signaled by the kernel when the device wants to wake up;
    /// its raw fd is embedded in the feature payload.
    pub fn pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()> {
        let payload = vfio_device_low_power_entry_with_wakeup {
            wakeup_eventfd: wakeup_evt.as_raw_descriptor(),
            reserved: 0,
        };
        let payload_size = mem::size_of::<vfio_device_low_power_entry_with_wakeup>();
        // The trailing flexible array of vfio_device_feature carries the payload.
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(payload_size);
        device_feature[0].argsz = (mem::size_of::<vfio_device_feature>() + payload_size) as u32;
        device_feature[0].flags =
            VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
        unsafe {
            // Safe as we know vfio_device_low_power_entry_with_wakeup has two 32-bit int fields,
            // so it is exactly 8 bytes and transmuting it to [u8; 8] is sound.
            device_feature[0]
                .data
                .as_mut_slice(payload_size)
                .copy_from_slice(
                    mem::transmute::<vfio_device_low_power_entry_with_wakeup, [u8; 8]>(payload)
                        .as_slice(),
                );
        }
        // Safe as we are the owner of self and power_management which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }
935 
936     /// exit the device's low power state
pm_low_power_exit(&self) -> Result<()>937     pub fn pm_low_power_exit(&self) -> Result<()> {
938         let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
939         device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
940         device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT;
941         // Safe as we are the owner of self and power_management which are valid value
942         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
943         if ret < 0 {
944             Err(VfioError::VfioPmLowPowerExit(get_error()))
945         } else {
946             Ok(())
947         }
948     }
949 
950     /// Enable vfio device's irq and associate Irqfd Event with device.
951     /// When MSIx is enabled, multi vectors will be supported, and vectors starting from subindex to subindex +
952     /// descriptors length will be assigned with irqfd in the descriptors array.
953     /// when index = VFIO_PCI_REQ_IRQ_INDEX, kernel vfio will trigger this event when physical device
954     /// is removed.
955     /// If descriptor is None, -1 is assigned to the irq. A value of -1 is used to either de-assign
956     /// interrupts if already assigned or skip un-assigned interrupts.
irq_enable( &self, descriptors: &[Option<&Event>], index: u32, subindex: u32, ) -> Result<()>957     pub fn irq_enable(
958         &self,
959         descriptors: &[Option<&Event>],
960         index: u32,
961         subindex: u32,
962     ) -> Result<()> {
963         let count = descriptors.len();
964         let u32_size = mem::size_of::<u32>();
965         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
966         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
967         irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
968         irq_set[0].index = index;
969         irq_set[0].start = subindex;
970         irq_set[0].count = count as u32;
971 
972         // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
973         // is u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
974         // together as u32. It is safe as enough space is reserved through
975         // vec_with_array_field(u32)<count>.
976         let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
977         for descriptor in descriptors.iter().take(count) {
978             let (left, right) = data.split_at_mut(u32_size);
979             match descriptor {
980                 Some(fd) => left.copy_from_slice(&fd.as_raw_descriptor().to_ne_bytes()[..]),
981                 None => left.copy_from_slice(&(-1i32).to_ne_bytes()[..]),
982             }
983             data = right;
984         }
985 
986         // Safe as we are the owner of self and irq_set which are valid value
987         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
988         if ret < 0 {
989             Err(VfioError::VfioIrqEnable(get_error()))
990         } else {
991             Ok(())
992         }
993     }
994 
995     /// When intx is enabled, irqfd is used to trigger a level interrupt into guest, resample irqfd
996     /// is used to get guest EOI notification.
997     /// When host hw generates interrupt, vfio irq handler in host kernel receive and handle it,
998     /// this handler disable hw irq first, then trigger irqfd to inject interrupt into guest. When
999     /// resample irqfd is triggered by guest EOI, vfio kernel could enable hw irq, so hw could
1000     /// generate another interrupts.
1001     /// This function enable resample irqfd and let vfio kernel could get EOI notification.
1002     ///
1003     /// descriptor: should be resample IrqFd.
resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()>1004     pub fn resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()> {
1005         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
1006         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
1007         irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
1008         irq_set[0].index = index;
1009         irq_set[0].start = 0;
1010         irq_set[0].count = 1;
1011 
1012         {
1013             // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data is
1014             // u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1015             // together as u32. It is safe as enough space is reserved through
1016             // vec_with_array_field(u32)<1>.
1017             let descriptors = unsafe { irq_set[0].data.as_mut_slice(4) };
1018             descriptors.copy_from_slice(&descriptor.as_raw_descriptor().to_le_bytes()[..]);
1019         }
1020 
1021         // Safe as we are the owner of self and irq_set which are valid value
1022         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
1023         if ret < 0 {
1024             Err(VfioError::VfioIrqEnable(get_error()))
1025         } else {
1026             Ok(())
1027         }
1028     }
1029 
1030     /// disable vfio device's irq and disconnect Irqfd Event with device
irq_disable(&self, index: u32) -> Result<()>1031     pub fn irq_disable(&self, index: u32) -> Result<()> {
1032         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
1033         irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
1034         irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1035         irq_set[0].index = index;
1036         irq_set[0].start = 0;
1037         irq_set[0].count = 0;
1038 
1039         // Safe as we are the owner of self and irq_set which are valid value
1040         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
1041         if ret < 0 {
1042             Err(VfioError::VfioIrqDisable(get_error()))
1043         } else {
1044             Ok(())
1045         }
1046     }
1047 
1048     /// Unmask vfio device irq
irq_unmask(&self, index: u32) -> Result<()>1049     pub fn irq_unmask(&self, index: u32) -> Result<()> {
1050         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
1051         irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
1052         irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
1053         irq_set[0].index = index;
1054         irq_set[0].start = 0;
1055         irq_set[0].count = 1;
1056 
1057         // Safe as we are the owner of self and irq_set which are valid value
1058         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
1059         if ret < 0 {
1060             Err(VfioError::VfioIrqUnmask(get_error()))
1061         } else {
1062             Ok(())
1063         }
1064     }
1065 
1066     /// Mask vfio device irq
irq_mask(&self, index: u32) -> Result<()>1067     pub fn irq_mask(&self, index: u32) -> Result<()> {
1068         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
1069         irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
1070         irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
1071         irq_set[0].index = index;
1072         irq_set[0].start = 0;
1073         irq_set[0].count = 1;
1074 
1075         // Safe as we are the owner of self and irq_set which are valid value
1076         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
1077         if ret < 0 {
1078             Err(VfioError::VfioIrqMask(get_error()))
1079         } else {
1080             Ok(())
1081         }
1082     }
1083 
    /// Get and validate VFIO device information.
    ///
    /// Issues VFIO_DEVICE_GET_INFO and classifies the device as PCI or
    /// platform based on the returned flags. PCI devices must expose at least
    /// the config region and the MSI-X irq index.
    fn get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)> {
        let mut dev_info = vfio_device_info {
            argsz: mem::size_of::<vfio_device_info>() as u32,
            flags: 0,
            num_regions: 0,
            num_irqs: 0,
            ..Default::default()
        };

        // Safe as we are the owner of device_file and dev_info which are valid value,
        // and we verify the return value.
        let ret = unsafe { ioctl_with_mut_ref(device_file, VFIO_DEVICE_GET_INFO(), &mut dev_info) };
        if ret < 0 {
            return Err(VfioError::VfioDeviceGetInfo(get_error()));
        }

        let dev_type = if (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) != 0 {
            if dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
                || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
            {
                // NOTE(review): the ioctl succeeded here, so get_error() reads
                // a stale errno — the reported OS error is meaningless for this
                // validation failure. Consider a dedicated error variant.
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            VfioDeviceType::Pci
        } else if (dev_info.flags & VFIO_DEVICE_FLAGS_PLATFORM) != 0 {
            VfioDeviceType::Platform
        } else {
            return Err(VfioError::UnknownDeviceType(dev_info.flags));
        };

        Ok((dev_info, dev_type))
    }
1117 
1118     /// Query interrupt information
1119     /// return: Vector of interrupts information, each of which contains flags and index
get_irqs(&self) -> Result<Vec<VfioIrq>>1120     pub fn get_irqs(&self) -> Result<Vec<VfioIrq>> {
1121         let mut irqs: Vec<VfioIrq> = Vec::new();
1122 
1123         for i in 0..self.num_irqs {
1124             let argsz = mem::size_of::<vfio_irq_info>() as u32;
1125             let mut irq_info = vfio_irq_info {
1126                 argsz,
1127                 flags: 0,
1128                 index: i,
1129                 count: 0,
1130             };
1131             // Safe as we are the owner of dev and irq_info which are valid value,
1132             // and we verify the return value.
1133             let ret = unsafe {
1134                 ioctl_with_mut_ref(
1135                     self.device_file(),
1136                     VFIO_DEVICE_GET_IRQ_INFO(),
1137                     &mut irq_info,
1138                 )
1139             };
1140             if ret < 0 || irq_info.count != 1 {
1141                 return Err(VfioError::VfioDeviceGetInfo(get_error()));
1142             }
1143 
1144             let irq = VfioIrq {
1145                 flags: irq_info.flags,
1146                 index: irq_info.index,
1147             };
1148             irqs.push(irq);
1149         }
1150         Ok(irqs)
1151     }
1152 
    /// Query all regions of the device via VFIO_DEVICE_GET_REGION_INFO and
    /// collect their flags, size, file offset, sparse mmap areas and
    /// capability type info.
    ///
    /// For regions whose info carries extra capabilities (argsz larger than the
    /// base struct), the ioctl is reissued with a buffer big enough for the
    /// capability chain, which is then walked via vfio_info_cap_header links.
    #[allow(clippy::cast_ptr_alignment)]
    fn get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>> {
        let mut regions: Vec<VfioRegion> = Vec::new();
        for i in 0..num_regions {
            let argsz = mem::size_of::<vfio_region_info>() as u32;
            let mut reg_info = vfio_region_info {
                argsz,
                flags: 0,
                index: i,
                cap_offset: 0,
                size: 0,
                offset: 0,
            };
            // Safe as we are the owner of dev and reg_info which are valid value,
            // and we verify the return value.
            let ret =
                unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO(), &mut reg_info) };
            if ret < 0 {
                // NOTE(review): skipping a failed region (and the CAPS-flag
                // `continue` below) means entries in the returned Vec no
                // longer line up with VFIO region indices — confirm callers
                // tolerate this.
                continue;
            }

            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
            let mut cap_info: Option<(u32, u32)> = None;
            // argsz grown by the kernel signals a trailing capability chain.
            if reg_info.argsz > argsz {
                let cap_len: usize = (reg_info.argsz - argsz) as usize;
                let mut region_with_cap =
                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
                region_with_cap[0].region_info.argsz = reg_info.argsz;
                region_with_cap[0].region_info.flags = 0;
                region_with_cap[0].region_info.index = i;
                region_with_cap[0].region_info.cap_offset = 0;
                region_with_cap[0].region_info.size = 0;
                region_with_cap[0].region_info.offset = 0;
                // Safe as we are the owner of dev and region_info which are valid value,
                // and we verify the return value.
                let ret = unsafe {
                    ioctl_with_mut_ref(
                        dev,
                        VFIO_DEVICE_GET_REGION_INFO(),
                        &mut (region_with_cap[0].region_info),
                    )
                };
                if ret < 0 {
                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
                }

                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
                    continue;
                }

                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
                let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
                let region_info_sz = reg_info.argsz;

                // region_with_cap[0].cap_info may contain many structures, like
                // vfio_region_info_cap_sparse_mmap struct or vfio_region_info_cap_type struct.
                // Both of them begin with vfio_info_cap_header, so we will get individual cap from
                // vfio_into_cap_header.
                // Go through all the cap structs. Each bounds check below
                // guards the subsequent unsafe pointer cast.
                // NOTE(review): the u32 additions/multiplication in these
                // checks are unchecked; a hostile kernel could in principle
                // overflow them — confirm trusted-kernel assumption.
                let info_ptr = region_with_cap.as_ptr() as *mut u8;
                let mut offset = region_with_cap[0].region_info.cap_offset;
                while offset != 0 {
                    if offset + cap_header_sz > region_info_sz {
                        break;
                    }
                    // Safe, as cap_header struct is in this function allocated region_with_cap
                    // vec.
                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
                    let cap_header =
                        unsafe { &*(cap_ptr as *mut u8 as *const vfio_info_cap_header) };
                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
                        if offset + mmap_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
                        // Safe, this vfio_region_info_cap_sparse_mmap is in this function allocated
                        // region_with_cap vec.
                        let sparse_mmap = unsafe {
                            &*(cap_ptr as *mut u8 as *const vfio_region_info_cap_sparse_mmap)
                        };

                        let area_num = sparse_mmap.nr_areas;
                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
                            break;
                        }
                        // Safe, these vfio_region_sparse_mmap_area are in this function allocated
                        // region_with_cap vec.
                        let areas =
                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
                        for area in areas.iter() {
                            mmaps.push(*area);
                        }
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
                        if offset + type_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_type here
                        // Safe, this vfio_region_info_cap_type is in this function allocated
                        // region_with_cap vec
                        let cap_type_info =
                            unsafe { &*(cap_ptr as *mut u8 as *const vfio_region_info_cap_type) };

                        cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_MSIX_MAPPABLE {
                        // MSI-X mappable: the whole region can be mmapped.
                        mmaps.push(vfio_region_sparse_mmap_area {
                            offset: 0,
                            size: region_with_cap[0].region_info.size,
                        });
                    }

                    offset = cap_header.next;
                }
            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // No capability chain: if the region is mmapable at all, the
                // whole region is one mmap area.
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: 0,
                    size: reg_info.size,
                });
            }

            let region = VfioRegion {
                flags: reg_info.flags,
                size: reg_info.size,
                offset: reg_info.offset,
                mmaps,
                cap_info,
            };
            regions.push(region);
        }

        Ok(regions)
    }
1286 
1287     /// get a region's flag
1288     /// the return's value may conatin:
1289     ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
1290     ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
1291     ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
1292     ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
get_region_flags(&self, index: u32) -> u321293     pub fn get_region_flags(&self, index: u32) -> u32 {
1294         match self.regions.get(index as usize) {
1295             Some(v) => v.flags,
1296             None => {
1297                 warn!("get_region_flags() with invalid index: {}", index);
1298                 0
1299             }
1300         }
1301     }
1302 
1303     /// get a region's offset
1304     /// return: Region offset from the start of vfio device descriptor
get_region_offset(&self, index: u32) -> u641305     pub fn get_region_offset(&self, index: u32) -> u64 {
1306         match self.regions.get(index as usize) {
1307             Some(v) => v.offset,
1308             None => {
1309                 warn!("get_region_offset with invalid index: {}", index);
1310                 0
1311             }
1312         }
1313     }
1314 
1315     /// get a region's size
1316     /// return: Region size from the start of vfio device descriptor
get_region_size(&self, index: u32) -> u641317     pub fn get_region_size(&self, index: u32) -> u64 {
1318         match self.regions.get(index as usize) {
1319             Some(v) => v.size,
1320             None => {
1321                 warn!("get_region_size with invalid index: {}", index);
1322                 0
1323             }
1324         }
1325     }
1326 
    /// get a number of regions
    /// return: Number of regions of vfio device descriptor
    pub fn get_region_count(&self) -> u32 {
        self.regions.len() as u32
    }
1332 
1333     /// get a region's mmap info vector
get_region_mmap(&self, index: u32) -> Vec<vfio_region_sparse_mmap_area>1334     pub fn get_region_mmap(&self, index: u32) -> Vec<vfio_region_sparse_mmap_area> {
1335         match self.regions.get(index as usize) {
1336             Some(v) => v.mmaps.clone(),
1337             None => {
1338                 warn!("get_region_mmap with invalid index: {}", index);
1339                 Vec::new()
1340             }
1341         }
1342     }
1343 
1344     /// find the specified cap type in device regions
1345     /// Input:
1346     ///      type_:  cap type
1347     ///      sub_type: cap sub_type
1348     /// Output:
1349     ///     None: device doesn't have the specified cap type
1350     ///     Some((bar_index, region_size)): device has the specified cap type, return region's
1351     ///                                     index and size
get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)>1352     pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
1353         for (index, region) in self.regions.iter().enumerate() {
1354             if let Some(cap_info) = &region.cap_info {
1355                 if cap_info.0 == type_ && cap_info.1 == sub_type {
1356                     return Some((index as u32, region.size));
1357                 }
1358             }
1359         }
1360 
1361         None
1362     }
1363 
1364     /// Returns file offset corresponding to the given `VfioRegionAddr`.
1365     /// The offset can be used when reading/writing the VFIO device's FD directly.
get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64>1366     pub fn get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64> {
1367         let region = self
1368             .regions
1369             .get(addr.index as usize)
1370             .ok_or(VfioError::InvalidIndex(addr.index))?;
1371         Ok(region.offset + addr.addr)
1372     }
1373 
1374     /// Read region's data from VFIO device into buf
1375     /// index: region num
1376     /// buf: data destination and buf length is read size
1377     /// addr: offset in the region
region_read(&self, index: u32, buf: &mut [u8], addr: u64)1378     pub fn region_read(&self, index: u32, buf: &mut [u8], addr: u64) {
1379         let stub: &VfioRegion = self
1380             .regions
1381             .get(index as usize)
1382             .unwrap_or_else(|| panic!("tried to read VFIO with an invalid index: {}", index));
1383 
1384         let size = buf.len() as u64;
1385         if size > stub.size || addr + size > stub.size {
1386             panic!(
1387                 "tried to read VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1388                 index, addr, size
1389             );
1390         }
1391 
1392         self.dev
1393             .read_exact_at(buf, stub.offset + addr)
1394             .unwrap_or_else(|e| {
1395                 panic!(
1396                     "failed to read region: index={}, addr=0x{:x}, error={}",
1397                     index, addr, e
1398                 )
1399             });
1400     }
1401 
    /// Reads a value from the specified `VfioRegionAddr.addr` + `offset`.
    ///
    /// Panics on out-of-range addresses or read failure (see `region_read`).
    pub fn region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T {
        let mut val = mem::MaybeUninit::zeroed();
        // Safe because we have zero-initialized `size_of::<T>()` bytes.
        let buf =
            unsafe { slice::from_raw_parts_mut(val.as_mut_ptr() as *mut u8, mem::size_of::<T>()) };
        self.region_read(addr.index, buf, addr.addr + offset);
        // Safe because any bit pattern is valid for a type that implements FromBytes.
        unsafe { val.assume_init() }
    }
1412 
1413     /// write the data from buf into a vfio device region
1414     /// index: region num
1415     /// buf: data src and buf length is write size
1416     /// addr: offset in the region
region_write(&self, index: u32, buf: &[u8], addr: u64)1417     pub fn region_write(&self, index: u32, buf: &[u8], addr: u64) {
1418         let stub: &VfioRegion = self
1419             .regions
1420             .get(index as usize)
1421             .unwrap_or_else(|| panic!("tried to write VFIO with an invalid index: {}", index));
1422 
1423         let size = buf.len() as u64;
1424         if size > stub.size
1425             || addr + size > stub.size
1426             || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
1427         {
1428             panic!(
1429                 "tried to write VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1430                 index, addr, size
1431             );
1432         }
1433 
1434         self.dev
1435             .write_all_at(buf, stub.offset + addr)
1436             .unwrap_or_else(|e| {
1437                 panic!(
1438                     "failed to write region: index={}, addr=0x{:x}, error={}",
1439                     index, addr, e
1440                 )
1441             });
1442     }
1443 
    /// Writes data into the specified `VfioRegionAddr.addr` + `offset`.
    ///
    /// Panics on invalid region/range or write failure (see `region_write`).
    pub fn region_write_to_addr<T: AsBytes>(&self, val: &T, addr: &VfioRegionAddr, offset: u64) {
        self.region_write(addr.index, val.as_bytes(), addr.addr + offset);
    }
1448 
1449     /// get vfio device's descriptors which are passed into minijail process
keep_rds(&self) -> Vec<RawDescriptor>1450     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
1451         vec![
1452             self.dev.as_raw_descriptor(),
1453             self.group_descriptor,
1454             self.container.lock().as_raw_descriptor(),
1455         ]
1456     }
1457 
    /// Add (iova, user_addr) map into vfio container iommu table
    ///
    /// # Safety
    /// `user_addr` must point to `size` bytes of memory that remains valid and
    /// mapped for as long as the IOMMU mapping exists; the caller is
    /// responsible for unmapping with `vfio_dma_unmap` before releasing it.
    /// (Delegates to `VfioContainer::vfio_dma_map` — its safety contract
    /// applies.)
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        self.container
            .lock()
            .vfio_dma_map(iova, size, user_addr, write_en)
    }
1470 
1471     /// Remove (iova, user_addr) map from vfio container iommu table
vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()>1472     pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
1473         self.container.lock().vfio_dma_unmap(iova, size)
1474     }
1475 
vfio_get_iommu_page_size_mask(&self) -> Result<u64>1476     pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
1477         self.container.lock().vfio_get_iommu_page_size_mask()
1478     }
1479 
alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64>1480     pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64> {
1481         self.iova_alloc
1482             .lock()
1483             .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size)
1484             .map_err(VfioError::Resources)
1485     }
1486 
get_iova(&self, alloc: &Alloc) -> Option<AddressRange>1487     pub fn get_iova(&self, alloc: &Alloc) -> Option<AddressRange> {
1488         self.iova_alloc.lock().get(alloc).map(|res| res.0)
1489     }
1490 
release_iova(&self, alloc: Alloc) -> Result<AddressRange>1491     pub fn release_iova(&self, alloc: Alloc) -> Result<AddressRange> {
1492         self.iova_alloc
1493             .lock()
1494             .release(alloc)
1495             .map_err(VfioError::Resources)
1496     }
1497 
get_max_addr(&self) -> u641498     pub fn get_max_addr(&self) -> u64 {
1499         self.iova_alloc.lock().get_max_addr()
1500     }
1501 
1502     /// Gets the vfio device backing `File`.
device_file(&self) -> &File1503     pub fn device_file(&self) -> &File {
1504         &self.dev
1505     }
1506 
1507     /// close vfio device
close(&self)1508     pub fn close(&self) {
1509         self.container.lock().remove_group(self.group_id, true);
1510     }
1511 }
1512 
/// Accessor for a VFIO PCI device's configuration space region.
pub struct VfioPciConfig {
    // Shared handle to the VFIO device whose config region is read/written.
    device: Arc<VfioDevice>,
}
1516 
1517 impl VfioPciConfig {
new(device: Arc<VfioDevice>) -> Self1518     pub fn new(device: Arc<VfioDevice>) -> Self {
1519         VfioPciConfig { device }
1520     }
1521 
read_config<T: FromBytes>(&self, offset: u32) -> T1522     pub fn read_config<T: FromBytes>(&self, offset: u32) -> T {
1523         let mut buf = vec![0u8; std::mem::size_of::<T>()];
1524         self.device
1525             .region_read(VFIO_PCI_CONFIG_REGION_INDEX, &mut buf, offset.into());
1526         T::read_from(&buf[..]).expect("failed to convert config data from slice")
1527     }
1528 
write_config<T: AsBytes>(&self, config: T, offset: u32)1529     pub fn write_config<T: AsBytes>(&self, config: T, offset: u32) {
1530         self.device.region_write(
1531             VFIO_PCI_CONFIG_REGION_INDEX,
1532             config.as_bytes(),
1533             offset.into(),
1534         );
1535     }
1536 
1537     /// Set the VFIO device this config refers to as the bus master.
set_bus_master(&self)1538     pub fn set_bus_master(&self) {
1539         /// Constant definitions from `linux/pci_regs.h`.
1540         const PCI_COMMAND: u32 = 0x4;
1541         /// Enable bus mastering
1542         const PCI_COMMAND_MASTER: u16 = 0x4;
1543 
1544         let mut cmd: u16 = self.read_config(PCI_COMMAND);
1545 
1546         if cmd & PCI_COMMAND_MASTER != 0 {
1547             return;
1548         }
1549 
1550         cmd |= PCI_COMMAND_MASTER;
1551 
1552         self.write_config(cmd, PCI_COMMAND);
1553     }
1554 }
1555 
1556 impl AsRawDescriptor for VfioDevice {
as_raw_descriptor(&self) -> RawDescriptor1557     fn as_raw_descriptor(&self) -> RawDescriptor {
1558         self.dev.as_raw_descriptor()
1559     }
1560 }
1561