• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::cell::RefCell;
6 use std::collections::HashMap;
7 use std::ffi::CString;
8 use std::fs::File;
9 use std::fs::OpenOptions;
10 use std::io;
11 use std::mem;
12 use std::os::raw::c_ulong;
13 use std::os::unix::prelude::FileExt;
14 use std::path::Path;
15 use std::path::PathBuf;
16 #[cfg(all(target_os = "android", target_arch = "aarch64"))]
17 use std::ptr::addr_of_mut;
18 use std::slice;
19 use std::sync::Arc;
20 use std::u32;
21 
22 use base::error;
23 use base::ioctl;
24 use base::ioctl_with_mut_ptr;
25 use base::ioctl_with_mut_ref;
26 use base::ioctl_with_ptr;
27 use base::ioctl_with_ref;
28 use base::ioctl_with_val;
29 use base::warn;
30 use base::AsRawDescriptor;
31 use base::Error;
32 use base::Event;
33 use base::FromRawDescriptor;
34 use base::RawDescriptor;
35 use base::SafeDescriptor;
36 use cfg_if::cfg_if;
37 use data_model::vec_with_array_field;
38 use hypervisor::DeviceKind;
39 use hypervisor::Vm;
40 use once_cell::sync::OnceCell;
41 use rand::seq::index::sample;
42 use rand::thread_rng;
43 use remain::sorted;
44 use resources::address_allocator::AddressAllocator;
45 use resources::AddressRange;
46 use resources::Alloc;
47 use resources::Error as ResourcesError;
48 use sync::Mutex;
49 use thiserror::Error;
50 use vfio_sys::vfio::vfio_acpi_dsm;
51 use vfio_sys::vfio::VFIO_IRQ_SET_DATA_BOOL;
52 use vfio_sys::*;
53 use zerocopy::AsBytes;
54 use zerocopy::FromBytes;
55 
56 use crate::IommuDevType;
57 
58 #[sorted]
59 #[derive(Error, Debug)]
60 pub enum VfioError {
61     #[error("failed to borrow global vfio container")]
62     BorrowVfioContainer,
63     #[error("failed to duplicate VfioContainer")]
64     ContainerDupError,
65     #[error("failed to set container's IOMMU driver type as {0:?}: {1}")]
66     ContainerSetIOMMU(IommuType, Error),
67     #[error("failed to create KVM vfio device: {0}")]
68     CreateVfioKvmDevice(Error),
69     #[error("failed to get Group Status: {0}")]
70     GetGroupStatus(Error),
71     #[error("failed to get vfio device fd: {0}")]
72     GroupGetDeviceFD(Error),
73     #[error("failed to add vfio group into vfio container: {0}")]
74     GroupSetContainer(Error),
75     #[error("group is inviable")]
76     GroupViable,
77     #[error("invalid region index: {0}")]
78     InvalidIndex(usize),
79     #[error("invalid operation")]
80     InvalidOperation,
81     #[error("invalid file path")]
82     InvalidPath,
83     #[error("failed to add guest memory map into iommu table: {0}")]
84     IommuDmaMap(Error),
85     #[error("failed to remove guest memory map from iommu table: {0}")]
86     IommuDmaUnmap(Error),
87     #[error("failed to get IOMMU cap info from host")]
88     IommuGetCapInfo,
89     #[error("failed to get IOMMU info from host: {0}")]
90     IommuGetInfo(Error),
91     #[error("failed to attach device to pKVM pvIOMMU: {0}")]
92     KvmPviommuSetConfig(Error),
93     #[error("failed to set KVM vfio device's attribute: {0}")]
94     KvmSetDeviceAttr(Error),
95     #[error("AddressAllocator is unavailable")]
96     NoRescAlloc,
97     #[error("failed to open /dev/vfio/vfio container: {0}")]
98     OpenContainer(io::Error),
99     #[error("failed to open {1} group: {0}")]
100     OpenGroup(io::Error, String),
101     #[error("failed to read {1} link: {0}")]
102     ReadLink(io::Error, PathBuf),
103     #[error("resources error: {0}")]
104     Resources(ResourcesError),
105     #[error("unknown vfio device type (flags: {0:#x})")]
106     UnknownDeviceType(u32),
107     #[error("failed to call vfio device's ACPI _DSM: {0}")]
108     VfioAcpiDsm(Error),
109     #[error("failed to disable vfio deviece's acpi notification: {0}")]
110     VfioAcpiNotificationDisable(Error),
111     #[error("failed to enable vfio deviece's acpi notification: {0}")]
112     VfioAcpiNotificationEnable(Error),
113     #[error("failed to test vfio deviece's acpi notification: {0}")]
114     VfioAcpiNotificationTest(Error),
115     #[error(
116         "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
117     )]
118     VfioApiVersion,
119     #[error("failed to get vfio device's info or info doesn't match: {0}")]
120     VfioDeviceGetInfo(Error),
121     #[error("failed to get vfio device's region info: {0}")]
122     VfioDeviceGetRegionInfo(Error),
123     #[error("container doesn't support IOMMU driver type {0:?}")]
124     VfioIommuSupport(IommuType),
125     #[error("failed to disable vfio deviece's irq: {0}")]
126     VfioIrqDisable(Error),
127     #[error("failed to enable vfio deviece's irq: {0}")]
128     VfioIrqEnable(Error),
129     #[error("failed to mask vfio deviece's irq: {0}")]
130     VfioIrqMask(Error),
131     #[error("failed to unmask vfio deviece's irq: {0}")]
132     VfioIrqUnmask(Error),
133     #[error("failed to enter vfio deviece's low power state: {0}")]
134     VfioPmLowPowerEnter(Error),
135     #[error("failed to exit vfio deviece's low power state: {0}")]
136     VfioPmLowPowerExit(Error),
137 }
138 
/// Module-local result type: every fallible VFIO operation returns `VfioError`.
type Result<T> = std::result::Result<T, VfioError>;
140 
/// Returns the error (derived from `errno`) for the most recent failed syscall.
fn get_error() -> Error {
    Error::last()
}
144 
// Process-wide KVM VFIO pseudo-device descriptor, created lazily on first use
// and shared by all containers/groups in this process.
static KVM_VFIO_FILE: OnceCell<SafeDescriptor> = OnceCell::new();
146 
/// The flavor of a VFIO device, which determines how its regions and
/// interrupts are interpreted elsewhere in this module.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VfioDeviceType {
    Pci,
    Platform,
}
152 
/// Whether a VFIO group is being added to or deleted from the KVM VFIO
/// pseudo-device (see `VfioGroup::kvm_device_set_group`).
enum KvmVfioGroupOps {
    Add,
    Delete,
}
157 
/// Handle to a pKVM para-virtualized IOMMU (pvIOMMU) instance.
#[derive(Debug)]
pub struct KvmVfioPviommu {
    // FD returned by the KVM_DEV_VFIO_PVIOMMU_ATTACH attribute; its raw value
    // doubles as the pvIOMMU id exposed to the guest (see `id()`).
    file: File,
}
162 
impl KvmVfioPviommu {
    /// Creates a new pvIOMMU instance by attaching to the KVM VFIO
    /// pseudo-device.
    ///
    /// Only implemented for android/aarch64 (pKVM); on any other target this
    /// panics via `unimplemented!()`.
    pub fn new(vm: &impl Vm) -> Result<Self> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let file = Self::ioctl_kvm_dev_vfio_pviommu_attach(vm)?;

                Ok(Self { file })
            } else {
                let _ = vm;
                unimplemented!()
            }
        }
    }

    /// Configures this pvIOMMU for `device`.
    ///
    /// NOTE(review): presumably this maps the virtual SID `vsid` onto the
    /// device's `sid_idx`-th stream ID — confirm against the pKVM
    /// KVM_PVIOMMU_SET_CONFIG documentation. android/aarch64 only; panics
    /// elsewhere.
    pub fn attach<T: AsRawDescriptor>(&self, device: &T, sid_idx: u32, vsid: u32) -> Result<()> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                self.ioctl_kvm_pviommu_set_config(device, sid_idx, vsid)
            } else {
                let _ = device;
                let _ = sid_idx;
                let _ = vsid;
                unimplemented!()
            }
        }
    }

    /// Returns the identifier guests use to refer to this pvIOMMU.
    ///
    /// Panics if the FD value does not fit in a `u32` (a negative FD would be
    /// a bug, as we own a valid `File`).
    pub fn id(&self) -> u32 {
        let fd = self.as_raw_descriptor();
        // Guests identify pvIOMMUs to the hypervisor using the corresponding VMM FDs.
        fd.try_into().unwrap()
    }

    /// Queries how many stream IDs (`nr_sids`) the kernel reports for
    /// `device`. android/aarch64 only; panics elsewhere.
    pub fn get_sid_count<T: AsRawDescriptor>(vm: &impl Vm, device: &T) -> Result<u32> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let info = Self::ioctl_kvm_dev_vfio_pviommu_get_info(vm, device)?;

                Ok(info.nr_sids)
            } else {
                let _ = vm;
                let _ = device;
                unimplemented!()
            }
        }
    }

    /// Issues KVM_DEV_VFIO_PVIOMMU_ATTACH on the (lazily created) KVM VFIO
    /// pseudo-device; on success the ioctl's return value is a new pvIOMMU FD.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_attach(vm: &impl Vm) -> Result<File> {
        let kvm_vfio_file = KVM_VFIO_FILE
            .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
            .map_err(VfioError::CreateVfioKvmDevice)?;

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_ATTACH as u64,
            addr: 0,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid.
        let ret = unsafe {
            ioctl_with_ref(
                kvm_vfio_file,
                kvm_sys::KVM_SET_DEVICE_ATTR(),
                &vfio_dev_attr,
            )
        };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            // SAFETY:
            // Safe as we verify the return value.
            Ok(unsafe { File::from_raw_descriptor(ret) })
        }
    }

    /// Issues KVM_PVIOMMU_SET_CONFIG on this pvIOMMU's FD with the
    /// (device_fd, sid_idx, vsid) triple.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_pviommu_set_config<T: AsRawDescriptor>(
        &self,
        device: &T,
        sid_idx: u32,
        vsid: u32,
    ) -> Result<()> {
        let config = kvm_sys::kvm_vfio_iommu_config {
            device_fd: device.as_raw_descriptor(),
            sid_idx,
            vsid,
        };

        // SAFETY:
        // Safe as we are the owner of device and config which are valid, and we verify the return
        // value.
        let ret = unsafe { ioctl_with_ref(self, kvm_sys::KVM_PVIOMMU_SET_CONFIG, &config) };

        if ret < 0 {
            Err(VfioError::KvmPviommuSetConfig(get_error()))
        } else {
            Ok(())
        }
    }

    /// Issues KVM_DEV_VFIO_PVIOMMU_GET_INFO for `device` on the (lazily
    /// created) KVM VFIO pseudo-device; the kernel fills `info` in place
    /// through the `addr` pointer.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_get_info<T: AsRawDescriptor>(
        vm: &impl Vm,
        device: &T,
    ) -> Result<kvm_sys::kvm_vfio_iommu_info> {
        let kvm_vfio_file = KVM_VFIO_FILE
            .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
            .map_err(VfioError::CreateVfioKvmDevice)?;

        let mut info = kvm_sys::kvm_vfio_iommu_info {
            device_fd: device.as_raw_descriptor(),
            nr_sids: 0,
        };

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_GET_INFO as u64,
            // The kernel writes back through this pointer; `info` outlives the
            // ioctl call below.
            addr: addr_of_mut!(info) as usize as u64,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid.
        let ret = unsafe {
            ioctl_with_ref(
                kvm_vfio_file,
                kvm_sys::KVM_SET_DEVICE_ATTR(),
                &vfio_dev_attr,
            )
        };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            Ok(info)
        }
    }
}
304 
impl AsRawDescriptor for KvmVfioPviommu {
    /// Exposes the pvIOMMU FD; this is the same value `id()` reports.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.file.as_raw_descriptor()
    }
}
310 
/// IOMMU backend driver types accepted by VFIO_SET_IOMMU. The discriminants
/// are the raw values passed to the ioctl, hence `#[repr(u32)]`.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum IommuType {
    Type1V2 = VFIO_TYPE1v2_IOMMU,
    PkvmPviommu = VFIO_PKVM_PVIOMMU,
    // ChromeOS specific vfio_iommu_type1 implementation that is optimized for
    // small, dynamic mappings. For clients which create large, relatively
    // static mappings, Type1V2 is still preferred.
    //
    // See crrev.com/c/3593528 for the implementation.
    Type1ChromeOS = 100001,
}
323 
/// VfioContainer contain multi VfioGroup, and delegate an IOMMU domain table
pub struct VfioContainer {
    // Open handle to /dev/vfio/vfio.
    container: File,
    // Groups attached to this container, keyed by group id.
    groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
    // Set once the first group is added and VFIO_SET_IOMMU succeeds; `None`
    // until then.
    iommu_type: Option<IommuType>,
}
330 
extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T> where T: FromBytes,331 fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T>
332 where
333     T: FromBytes,
334 {
335     bytes.get(offset..).and_then(T::read_from_prefix)
336 }
337 
338 const VFIO_API_VERSION: u8 = 0;
339 impl VfioContainer {
new() -> Result<Self>340     pub fn new() -> Result<Self> {
341         let container = OpenOptions::new()
342             .read(true)
343             .write(true)
344             .open("/dev/vfio/vfio")
345             .map_err(VfioError::OpenContainer)?;
346 
347         Self::new_from_container(container)
348     }
349 
350     // Construct a VfioContainer from an exist container file.
new_from_container(container: File) -> Result<Self>351     pub fn new_from_container(container: File) -> Result<Self> {
352         // SAFETY:
353         // Safe as file is vfio container descriptor and ioctl is defined by kernel.
354         let version = unsafe { ioctl(&container, VFIO_GET_API_VERSION()) };
355         if version as u8 != VFIO_API_VERSION {
356             return Err(VfioError::VfioApiVersion);
357         }
358 
359         Ok(VfioContainer {
360             container,
361             groups: HashMap::new(),
362             iommu_type: None,
363         })
364     }
365 
is_group_set(&self, group_id: u32) -> bool366     fn is_group_set(&self, group_id: u32) -> bool {
367         self.groups.get(&group_id).is_some()
368     }
369 
check_extension(&self, val: IommuType) -> bool370     fn check_extension(&self, val: IommuType) -> bool {
371         // SAFETY:
372         // Safe as file is vfio container and make sure val is valid.
373         let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION(), val as c_ulong) };
374         ret != 0
375     }
376 
set_iommu(&mut self, val: IommuType) -> i32377     fn set_iommu(&mut self, val: IommuType) -> i32 {
378         // SAFETY:
379         // Safe as file is vfio container and make sure val is valid.
380         unsafe { ioctl_with_val(self, VFIO_SET_IOMMU(), val as c_ulong) }
381     }
382 
set_iommu_checked(&mut self, val: IommuType) -> Result<()>383     fn set_iommu_checked(&mut self, val: IommuType) -> Result<()> {
384         if !self.check_extension(val) {
385             Err(VfioError::VfioIommuSupport(val))
386         } else if self.set_iommu(val) != 0 {
387             Err(VfioError::ContainerSetIOMMU(val, get_error()))
388         } else {
389             self.iommu_type = Some(val);
390             Ok(())
391         }
392     }
393 
394     /// # Safety
395     ///
396     /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
vfio_dma_map( &self, iova: u64, size: u64, user_addr: u64, write_en: bool, ) -> Result<()>397     pub unsafe fn vfio_dma_map(
398         &self,
399         iova: u64,
400         size: u64,
401         user_addr: u64,
402         write_en: bool,
403     ) -> Result<()> {
404         match self
405             .iommu_type
406             .expect("vfio_dma_map called before configuring IOMMU")
407         {
408             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
409                 self.vfio_iommu_type1_dma_map(iova, size, user_addr, write_en)
410             }
411             IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
412         }
413     }
414 
415     /// # Safety
416     ///
417     /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
vfio_iommu_type1_dma_map( &self, iova: u64, size: u64, user_addr: u64, write_en: bool, ) -> Result<()>418     unsafe fn vfio_iommu_type1_dma_map(
419         &self,
420         iova: u64,
421         size: u64,
422         user_addr: u64,
423         write_en: bool,
424     ) -> Result<()> {
425         let mut dma_map = vfio_iommu_type1_dma_map {
426             argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
427             flags: VFIO_DMA_MAP_FLAG_READ,
428             vaddr: user_addr,
429             iova,
430             size,
431         };
432 
433         if write_en {
434             dma_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
435         }
436 
437         let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA(), &dma_map);
438         if ret != 0 {
439             return Err(VfioError::IommuDmaMap(get_error()));
440         }
441 
442         Ok(())
443     }
444 
vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()>445     pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
446         match self
447             .iommu_type
448             .expect("vfio_dma_unmap called before configuring IOMMU")
449         {
450             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
451                 self.vfio_iommu_type1_dma_unmap(iova, size)
452             }
453             IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
454         }
455     }
456 
vfio_iommu_type1_dma_unmap(&self, iova: u64, size: u64) -> Result<()>457     fn vfio_iommu_type1_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
458         let mut dma_unmap = vfio_iommu_type1_dma_unmap {
459             argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
460             flags: 0,
461             iova,
462             size,
463             ..Default::default()
464         };
465 
466         // SAFETY:
467         // Safe as file is vfio container, dma_unmap is constructed by us, and
468         // we check the return value
469         let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA(), &mut dma_unmap) };
470         if ret != 0 || dma_unmap.size != size {
471             return Err(VfioError::IommuDmaUnmap(get_error()));
472         }
473 
474         Ok(())
475     }
476 
vfio_get_iommu_page_size_mask(&self) -> Result<u64>477     pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
478         match self
479             .iommu_type
480             .expect("vfio_get_iommu_page_size_mask called before configuring IOMMU")
481         {
482             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
483                 self.vfio_iommu_type1_get_iommu_page_size_mask()
484             }
485             IommuType::PkvmPviommu => Ok(0),
486         }
487     }
488 
vfio_iommu_type1_get_iommu_page_size_mask(&self) -> Result<u64>489     fn vfio_iommu_type1_get_iommu_page_size_mask(&self) -> Result<u64> {
490         let mut iommu_info = vfio_iommu_type1_info {
491             argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
492             flags: 0,
493             iova_pgsizes: 0,
494             ..Default::default()
495         };
496 
497         // SAFETY:
498         // Safe as file is vfio container, iommu_info has valid values,
499         // and we check the return value
500         let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO(), &mut iommu_info) };
501         if ret != 0 || (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES) == 0 {
502             return Err(VfioError::IommuGetInfo(get_error()));
503         }
504 
505         Ok(iommu_info.iova_pgsizes)
506     }
507 
vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>>508     pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
509         match self
510             .iommu_type
511             .expect("vfio_iommu_iova_get_iova_ranges called before configuring IOMMU")
512         {
513             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
514                 self.vfio_iommu_type1_get_iova_ranges()
515             }
516             IommuType::PkvmPviommu => Ok(Vec::new()),
517         }
518     }
519 
vfio_iommu_type1_get_iova_ranges(&self) -> Result<Vec<AddressRange>>520     fn vfio_iommu_type1_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
521         // Query the buffer size needed fetch the capabilities.
522         let mut iommu_info_argsz = vfio_iommu_type1_info {
523             argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
524             flags: 0,
525             iova_pgsizes: 0,
526             ..Default::default()
527         };
528 
529         // SAFETY:
530         // Safe as file is vfio container, iommu_info_argsz has valid values,
531         // and we check the return value
532         let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO(), &mut iommu_info_argsz) };
533         if ret != 0 {
534             return Err(VfioError::IommuGetInfo(get_error()));
535         }
536 
537         if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
538             return Err(VfioError::IommuGetCapInfo);
539         }
540 
541         let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
542             iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
543         );
544         iommu_info[0].argsz = iommu_info_argsz.argsz;
545         let ret =
546             // SAFETY:
547             // Safe as file is vfio container, iommu_info has valid values,
548             // and we check the return value
549             unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO(), iommu_info.as_mut_ptr()) };
550         if ret != 0 {
551             return Err(VfioError::IommuGetInfo(get_error()));
552         }
553 
554         // SAFETY:
555         // Safe because we initialized iommu_info with enough space, u8 has less strict
556         // alignment, and since it will no longer be mutated.
557         let info_bytes = unsafe {
558             std::slice::from_raw_parts(
559                 iommu_info.as_ptr() as *const u8,
560                 iommu_info_argsz.argsz as usize,
561             )
562         };
563 
564         if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 {
565             return Err(VfioError::IommuGetCapInfo);
566         }
567 
568         let mut offset = iommu_info[0].cap_offset as usize;
569         while offset != 0 {
570             let header = extract_vfio_struct::<vfio_info_cap_header>(info_bytes, offset)
571                 .ok_or(VfioError::IommuGetCapInfo)?;
572 
573             if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 {
574                 let iova_header =
575                     extract_vfio_struct::<vfio_iommu_type1_info_cap_iova_range_header>(
576                         info_bytes, offset,
577                     )
578                     .ok_or(VfioError::IommuGetCapInfo)?;
579                 let range_offset = offset + mem::size_of::<vfio_iommu_type1_info_cap_iova_range>();
580                 let mut ret = Vec::new();
581                 for i in 0..iova_header.nr_iovas {
582                     ret.push(
583                         extract_vfio_struct::<vfio_iova_range>(
584                             info_bytes,
585                             range_offset + i as usize * mem::size_of::<vfio_iova_range>(),
586                         )
587                         .ok_or(VfioError::IommuGetCapInfo)?,
588                     );
589                 }
590                 return Ok(ret
591                     .iter()
592                     .map(|range| AddressRange {
593                         start: range.start,
594                         end: range.end,
595                     })
596                     .collect());
597             }
598             offset = header.next as usize;
599         }
600 
601         Err(VfioError::IommuGetCapInfo)
602     }
603 
set_iommu_from(&mut self, iommu_dev: IommuDevType) -> Result<()>604     fn set_iommu_from(&mut self, iommu_dev: IommuDevType) -> Result<()> {
605         match iommu_dev {
606             IommuDevType::CoIommu | IommuDevType::VirtioIommu => {
607                 // If we expect granular, dynamic mappings, try the ChromeOS Type1ChromeOS first,
608                 // then fall back to upstream versions.
609                 self.set_iommu_checked(IommuType::Type1ChromeOS)
610                     .or_else(|_| self.set_iommu_checked(IommuType::Type1V2))
611             }
612             IommuDevType::NoIommu => self.set_iommu_checked(IommuType::Type1V2),
613             IommuDevType::PkvmPviommu => self.set_iommu_checked(IommuType::PkvmPviommu),
614         }
615     }
616 
get_group_with_vm( &mut self, id: u32, vm: &impl Vm, iommu_dev: IommuDevType, ) -> Result<Arc<Mutex<VfioGroup>>>617     fn get_group_with_vm(
618         &mut self,
619         id: u32,
620         vm: &impl Vm,
621         iommu_dev: IommuDevType,
622     ) -> Result<Arc<Mutex<VfioGroup>>> {
623         if let Some(group) = self.groups.get(&id) {
624             return Ok(group.clone());
625         }
626 
627         let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
628         if self.groups.is_empty() {
629             self.set_iommu_from(iommu_dev)?;
630             // Before the first group is added into container, do once per container
631             // initialization. Both coiommu and virtio-iommu rely on small, dynamic
632             // mappings. However, if an iommu is not enabled, then we map the entirety
633             // of guest memory as a small number of large, static mappings.
634             match iommu_dev {
635                 IommuDevType::CoIommu | IommuDevType::PkvmPviommu | IommuDevType::VirtioIommu => {}
636                 IommuDevType::NoIommu => {
637                     for region in vm.get_memory().regions() {
638                         // SAFETY:
639                         // Safe because the guest regions are guaranteed not to overlap
640                         unsafe {
641                             self.vfio_dma_map(
642                                 region.guest_addr.0,
643                                 region.size as u64,
644                                 region.host_addr as u64,
645                                 true,
646                             )
647                         }?;
648                     }
649                 }
650             }
651         }
652 
653         let kvm_vfio_file = KVM_VFIO_FILE
654             .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
655             .map_err(VfioError::CreateVfioKvmDevice)?;
656         group
657             .lock()
658             .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Add)?;
659 
660         self.groups.insert(id, group.clone());
661 
662         Ok(group)
663     }
664 
get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>>665     fn get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>> {
666         if let Some(group) = self.groups.get(&id) {
667             return Ok(group.clone());
668         }
669 
670         let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
671 
672         if self.groups.is_empty() {
673             // Before the first group is added into container, do once per
674             // container initialization.
675             self.set_iommu_checked(IommuType::Type1V2)?;
676         }
677 
678         self.groups.insert(id, group.clone());
679         Ok(group)
680     }
681 
remove_group(&mut self, id: u32, reduce: bool)682     fn remove_group(&mut self, id: u32, reduce: bool) {
683         let mut remove = false;
684 
685         if let Some(group) = self.groups.get(&id) {
686             if reduce {
687                 group.lock().reduce_device_num();
688             }
689             if group.lock().device_num() == 0 {
690                 let kvm_vfio_file = KVM_VFIO_FILE.get().expect("kvm vfio file isn't created");
691                 if group
692                     .lock()
693                     .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Delete)
694                     .is_err()
695                 {
696                     warn!("failing in remove vfio group from kvm device");
697                 }
698                 remove = true;
699             }
700         }
701 
702         if remove {
703             self.groups.remove(&id);
704         }
705     }
706 
clone_as_raw_descriptor(&self) -> Result<RawDescriptor>707     pub fn clone_as_raw_descriptor(&self) -> Result<RawDescriptor> {
708         // SAFETY: this call is safe because it doesn't modify any memory and we
709         // check the return value.
710         let raw_descriptor = unsafe { libc::dup(self.container.as_raw_descriptor()) };
711         if raw_descriptor < 0 {
712             Err(VfioError::ContainerDupError)
713         } else {
714             Ok(raw_descriptor)
715         }
716     }
717 
718     // Gets group ids for all groups in the container.
group_ids(&self) -> Vec<&u32>719     pub fn group_ids(&self) -> Vec<&u32> {
720         self.groups.keys().collect()
721     }
722 }
723 
impl AsRawDescriptor for VfioContainer {
    /// Exposes the raw /dev/vfio/vfio FD for ioctl helpers.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.container.as_raw_descriptor()
    }
}
729 
/// An open VFIO group (`/dev/vfio/<id>`) bound to a container.
struct VfioGroup {
    // Open handle to /dev/vfio/<id>.
    group: File,
    // Number of devices currently opened from this group; the container drops
    // the group when this reaches zero.
    device_num: u32,
}
734 
735 impl VfioGroup {
    /// Opens the group file `/dev/vfio/{id}`, verifies the kernel reports the
    /// group as viable, and attaches it to `container` via
    /// VFIO_GROUP_SET_CONTAINER.
    fn new(container: &VfioContainer, id: u32) -> Result<Self> {
        let group_path = format!("/dev/vfio/{}", id);
        let group_file = OpenOptions::new()
            .read(true)
            .write(true)
            .open(Path::new(&group_path))
            .map_err(|e| VfioError::OpenGroup(e, group_path))?;

        let mut group_status = vfio_group_status {
            argsz: mem::size_of::<vfio_group_status>() as u32,
            flags: 0,
        };
        let mut ret =
            // SAFETY:
            // Safe as we are the owner of group_file and group_status which are valid value.
            unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS(), &mut group_status) };
        if ret < 0 {
            return Err(VfioError::GetGroupStatus(get_error()));
        }

        // Refuse groups the kernel does not report as viable.
        if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
            return Err(VfioError::GroupViable);
        }

        let container_raw_descriptor = container.as_raw_descriptor();
        // SAFETY:
        // Safe as we are the owner of group_file and container_raw_descriptor which are valid
        // value, and we verify the ret value
        ret = unsafe {
            ioctl_with_ref(
                &group_file,
                VFIO_GROUP_SET_CONTAINER(),
                &container_raw_descriptor,
            )
        };
        if ret < 0 {
            return Err(VfioError::GroupSetContainer(get_error()));
        }

        Ok(VfioGroup {
            group: group_file,
            device_num: 0,
        })
    }
780 
get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32>781     fn get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32> {
782         let mut uuid_path = PathBuf::new();
783         uuid_path.push(sysfspath);
784         uuid_path.push("iommu_group");
785         let group_path = uuid_path
786             .read_link()
787             .map_err(|e| VfioError::ReadLink(e, uuid_path))?;
788         let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
789         let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
790         let group_id = group_str
791             .parse::<u32>()
792             .map_err(|_| VfioError::InvalidPath)?;
793 
794         Ok(group_id)
795     }
796 
    /// Tells the KVM VFIO pseudo-device to add or delete this group
    /// (KVM_DEV_VFIO_GROUP_ADD / KVM_DEV_VFIO_GROUP_DEL), passing the group FD
    /// through the attribute's `addr` pointer.
    fn kvm_device_set_group(
        &self,
        kvm_vfio_file: &SafeDescriptor,
        ops: KvmVfioGroupOps,
    ) -> Result<()> {
        let group_descriptor = self.as_raw_descriptor();
        // The kernel reads the group FD through this pointer; `group_descriptor`
        // stays alive until after the ioctl below.
        let group_descriptor_ptr = &group_descriptor as *const i32;
        let vfio_dev_attr = match ops {
            KvmVfioGroupOps::Add => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
                addr: group_descriptor_ptr as u64,
            },
            KvmVfioGroupOps::Delete => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_DEL as u64,
                addr: group_descriptor_ptr as u64,
            },
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_descriptor and vfio_dev_attr which are valid value,
        // and we verify the return value.
        if 0 != unsafe {
            ioctl_with_ref(
                kvm_vfio_file,
                kvm_sys::KVM_SET_DEVICE_ATTR(),
                &vfio_dev_attr,
            )
        } {
            return Err(VfioError::KvmSetDeviceAttr(get_error()));
        }

        Ok(())
    }
834 
    /// Opens the VFIO device named `name` within this group via the
    /// `VFIO_GROUP_GET_DEVICE_FD` ioctl and returns the new device file.
    ///
    /// # Panics
    /// Panics if `name` contains an interior NUL byte (not expected for names
    /// taken from sysfs path components).
    fn get_device(&self, name: &str) -> Result<File> {
        let path: CString = CString::new(name.as_bytes()).expect("CString::new() failed");
        let path_ptr = path.as_ptr();

        // SAFETY:
        // Safe as we are the owner of self and path_ptr which are valid value.
        let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD(), path_ptr) };
        if ret < 0 {
            return Err(VfioError::GroupGetDeviceFD(get_error()));
        }

        // SAFETY:
        // Safe as ret is valid descriptor
        Ok(unsafe { File::from_raw_descriptor(ret) })
    }
850 
    /// Increments the count of devices opened from this group.
    fn add_device_num(&mut self) {
        self.device_num += 1;
    }
854 
    /// Decrements the count of devices opened from this group.
    /// Underflows (panicking in debug builds) if called when the count is
    /// already zero; callers must pair it with `add_device_num`.
    fn reduce_device_num(&mut self) {
        self.device_num -= 1;
    }
858 
    /// Returns the number of devices currently opened from this group.
    fn device_num(&self) -> u32 {
        self.device_num
    }
862 }
863 
impl AsRawDescriptor for VfioGroup {
    /// Returns the raw descriptor of the underlying VFIO group file.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.group.as_raw_descriptor()
    }
}
869 
/// A helper trait for managing VFIO setup
pub trait VfioCommonTrait: Send + Sync {
    /// The single place to create a VFIO container for a PCI endpoint.
    ///
    /// The policy to determine whether an individual or a shared VFIO container
    /// will be created for this device is governed by the physical PCI topology,
    /// and the argument iommu_dev.
    ///
    ///  # Arguments
    ///
    ///  * `iommu_dev` - the kind of IOMMU (if any) the device is behind; this
    ///    selects which shared container is used
    ///  * `sysfspath` - the path to the PCI device, e.g.
    ///    /sys/bus/pci/devices/0000:02:00.0; required (`Some`) when `iommu_dev`
    ///    is `VirtioIommu`, unused otherwise
    fn vfio_get_container<P: AsRef<Path>>(
        iommu_dev: IommuDevType,
        sysfspath: Option<P>,
    ) -> Result<Arc<Mutex<VfioContainer>>>;
}
887 
// These container caches are thread-local (RefCell is not Sync): containers
// are only shared between VFIO devices created on the same thread.
thread_local! {

    // One VFIO container is shared by all VFIO devices that don't
    // attach to the virtio IOMMU device
    static NO_IOMMU_CONTAINER: RefCell<Option<Arc<Mutex<VfioContainer>>>> = RefCell::new(None);

    // For IOMMU enabled devices, all VFIO groups that share the same IOVA space
    // are managed by one VFIO container
    static IOMMU_CONTAINERS: RefCell<Option<Vec<Arc<Mutex<VfioContainer>>>>> = RefCell::new(Some(Default::default()));

    // One VFIO container is shared by all VFIO devices that
    // attach to the CoIOMMU device
    static COIOMMU_CONTAINER: RefCell<Option<Arc<Mutex<VfioContainer>>>> = RefCell::new(None);

    // One VFIO container is shared by all VFIO devices that attach to pKVM
    static PKVM_IOMMU_CONTAINER: RefCell<Option<Arc<Mutex<VfioContainer>>>> = RefCell::new(None);
}
905 
/// Unit struct implementing the default [`VfioCommonTrait`] container policy.
pub struct VfioCommonSetup;
907 
908 impl VfioCommonTrait for VfioCommonSetup {
vfio_get_container<P: AsRef<Path>>( iommu_dev: IommuDevType, sysfspath: Option<P>, ) -> Result<Arc<Mutex<VfioContainer>>>909     fn vfio_get_container<P: AsRef<Path>>(
910         iommu_dev: IommuDevType,
911         sysfspath: Option<P>,
912     ) -> Result<Arc<Mutex<VfioContainer>>> {
913         match iommu_dev {
914             IommuDevType::NoIommu => {
915                 // One VFIO container is used for all IOMMU disabled groups
916                 NO_IOMMU_CONTAINER.with(|v| {
917                     if v.borrow().is_some() {
918                         if let Some(ref container) = *v.borrow() {
919                             Ok(container.clone())
920                         } else {
921                             Err(VfioError::BorrowVfioContainer)
922                         }
923                     } else {
924                         let container = Arc::new(Mutex::new(VfioContainer::new()?));
925                         *v.borrow_mut() = Some(container.clone());
926                         Ok(container)
927                     }
928                 })
929             }
930             IommuDevType::VirtioIommu => {
931                 let path = sysfspath.ok_or(VfioError::InvalidPath)?;
932                 let group_id = VfioGroup::get_group_id(path)?;
933 
934                 // One VFIO container is used for all devices belong to one VFIO group
935                 // NOTE: vfio_wrapper relies on each container containing exactly one group.
936                 IOMMU_CONTAINERS.with(|v| {
937                     if let Some(ref mut containers) = *v.borrow_mut() {
938                         let container = containers
939                             .iter()
940                             .find(|container| container.lock().is_group_set(group_id));
941 
942                         match container {
943                             None => {
944                                 let container = Arc::new(Mutex::new(VfioContainer::new()?));
945                                 containers.push(container.clone());
946                                 Ok(container)
947                             }
948                             Some(container) => Ok(container.clone()),
949                         }
950                     } else {
951                         Err(VfioError::BorrowVfioContainer)
952                     }
953                 })
954             }
955             IommuDevType::CoIommu => {
956                 // One VFIO container is used for devices attached to CoIommu
957                 COIOMMU_CONTAINER.with(|v| {
958                     if v.borrow().is_some() {
959                         if let Some(ref container) = *v.borrow() {
960                             Ok(container.clone())
961                         } else {
962                             Err(VfioError::BorrowVfioContainer)
963                         }
964                     } else {
965                         let container = Arc::new(Mutex::new(VfioContainer::new()?));
966                         *v.borrow_mut() = Some(container.clone());
967                         Ok(container)
968                     }
969                 })
970             }
971             IommuDevType::PkvmPviommu => {
972                 // One VFIO container is used for devices attached to pKVM
973                 PKVM_IOMMU_CONTAINER.with(|v| {
974                     if v.borrow().is_some() {
975                         if let Some(ref container) = *v.borrow() {
976                             Ok(container.clone())
977                         } else {
978                             Err(VfioError::BorrowVfioContainer)
979                         }
980                     } else {
981                         let container = Arc::new(Mutex::new(VfioContainer::new()?));
982                         *v.borrow_mut() = Some(container.clone());
983                         Ok(container)
984                     }
985                 })
986             }
987         }
988     }
989 }
990 
/// Vfio Irq type used to enable/disable/mask/unmask vfio irq
pub enum VfioIrqType {
    /// Legacy level-triggered INTx interrupt.
    Intx,
    /// Message Signaled Interrupts.
    Msi,
    /// Extended Message Signaled Interrupts.
    Msix,
}
997 
/// Vfio Irq information used to assign and enable/disable/mask/unmask vfio irq
pub struct VfioIrq {
    /// `VFIO_IRQ_INFO_*` flags reported by the kernel for this interrupt.
    pub flags: u32,
    /// Index of this interrupt within the device's interrupt list.
    pub index: u32,
}
1003 
/// Address on VFIO memory region.
#[derive(Debug, Default, Clone)]
pub struct VfioRegionAddr {
    /// region number (index into the device's region list).
    pub index: usize,
    /// byte offset within the region.
    pub addr: u64,
}
1012 
/// Describes one memory region of a VFIO device.
#[derive(Debug)]
pub struct VfioRegion {
    // flags for this region: read/write/mmap
    flags: u32,
    // total size of the region in bytes
    size: u64,
    // region offset used to read/write with vfio device descriptor
    offset: u64,
    // vectors for mmap offset and size
    mmaps: Vec<vfio_region_sparse_mmap_area>,
    // type and subtype for cap type
    cap_info: Option<(u32, u32)>,
}
1025 
/// Vfio device for exposing regions which could be read/write to kernel vfio device.
pub struct VfioDevice {
    // File obtained via VFIO_GROUP_GET_DEVICE_FD for this device.
    dev: File,
    // Device name: the final component of its sysfs path.
    name: String,
    // Container the device's group is attached to (possibly shared).
    container: Arc<Mutex<VfioContainer>>,
    // PCI or platform, derived from VFIO_DEVICE_GET_INFO flags.
    dev_type: VfioDeviceType,
    // Raw descriptor of the owning VFIO group.
    group_descriptor: RawDescriptor,
    group_id: u32,
    // vec for vfio device's regions
    regions: Vec<VfioRegion>,
    // Number of interrupts reported by VFIO_DEVICE_GET_INFO.
    num_irqs: u32,

    // Allocator over the container's valid IOVA ranges.
    iova_alloc: Arc<Mutex<AddressAllocator>>,
    // Optional device-tree node label for this device.
    dt_symbol: Option<String>,
    // pKVM pvIOMMU instance and its virtual SIDs (PkvmPviommu devices only).
    pviommu: Option<(Arc<Mutex<KvmVfioPviommu>>, Vec<u32>)>,
}
1042 
1043 impl VfioDevice {
    /// Create a new vfio device, then guest read/write on this device could be
    /// transferred into kernel vfio.
    /// sysfspath specify the vfio device path in sys file system.
    ///
    /// # Arguments
    /// * `sysfspath` - path of the device in sysfs; its final component is the
    ///   VFIO device name.
    /// * `vm` - the VM the device is passed through to.
    /// * `container` - the (possibly shared) VFIO container to use.
    /// * `iommu_dev` - the kind of IOMMU the device is behind.
    /// * `dt_symbol` - optional device-tree node label for the device.
    pub fn new_passthrough<P: AsRef<Path>>(
        sysfspath: &P,
        vm: &impl Vm,
        container: Arc<Mutex<VfioContainer>>,
        iommu_dev: IommuDevType,
        dt_symbol: Option<String>,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;

        let group = container
            .lock()
            .get_group_with_vm(group_id, vm, iommu_dev)?;
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);
        let dev = group.lock().get_device(&name)?;
        let (dev_info, dev_type) = Self::get_device_info(&dev)?;
        let regions = Self::get_regions(&dev, dev_info.num_regions)?;
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        // Build an allocator over the IOVA ranges the container supports.
        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        let pviommu = if matches!(iommu_dev, IommuDevType::PkvmPviommu) {
            // We currently have a 1-to-1 mapping between pvIOMMUs and VFIO devices.
            let pviommu = KvmVfioPviommu::new(vm)?;

            // Sample distinct random virtual SIDs in [0, u32::MAX), one per
            // SID reported by get_sid_count, and attach each to the pvIOMMU.
            let vsids_len = KvmVfioPviommu::get_sid_count(vm, &dev)?.try_into().unwrap();
            let max_vsid = u32::MAX.try_into().unwrap();
            let random_vsids = sample(&mut thread_rng(), max_vsid, vsids_len).into_iter();
            let vsids = Vec::from_iter(random_vsids.map(|v| u32::try_from(v).unwrap()));
            for (i, vsid) in vsids.iter().enumerate() {
                pviommu.attach(&dev, i.try_into().unwrap(), *vsid)?;
            }

            Some((Arc::new(Mutex::new(pviommu)), vsids))
        } else {
            None
        };

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
            dt_symbol,
            pviommu,
        })
    }
1106 
new<P: AsRef<Path>>( sysfspath: &P, container: Arc<Mutex<VfioContainer>>, ) -> Result<Self>1107     pub fn new<P: AsRef<Path>>(
1108         sysfspath: &P,
1109         container: Arc<Mutex<VfioContainer>>,
1110     ) -> Result<Self> {
1111         let group_id = VfioGroup::get_group_id(sysfspath)?;
1112         let group = container.lock().get_group(group_id)?;
1113         let name_osstr = sysfspath
1114             .as_ref()
1115             .file_name()
1116             .ok_or(VfioError::InvalidPath)?;
1117         let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
1118         let name = String::from(name_str);
1119 
1120         let dev = match group.lock().get_device(&name) {
1121             Ok(dev) => dev,
1122             Err(e) => {
1123                 container.lock().remove_group(group_id, false);
1124                 return Err(e);
1125             }
1126         };
1127         let (dev_info, dev_type) = match Self::get_device_info(&dev) {
1128             Ok(dev_info) => dev_info,
1129             Err(e) => {
1130                 container.lock().remove_group(group_id, false);
1131                 return Err(e);
1132             }
1133         };
1134         let regions = match Self::get_regions(&dev, dev_info.num_regions) {
1135             Ok(regions) => regions,
1136             Err(e) => {
1137                 container.lock().remove_group(group_id, false);
1138                 return Err(e);
1139             }
1140         };
1141         group.lock().add_device_num();
1142         let group_descriptor = group.lock().as_raw_descriptor();
1143 
1144         let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
1145         let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
1146             .map_err(VfioError::Resources)?;
1147 
1148         Ok(VfioDevice {
1149             dev,
1150             name,
1151             container,
1152             dev_type,
1153             group_descriptor,
1154             group_id,
1155             regions,
1156             num_irqs: dev_info.num_irqs,
1157             iova_alloc: Arc::new(Mutex::new(iova_alloc)),
1158             dt_symbol: None,
1159             pviommu: None,
1160         })
1161     }
1162 
    /// Returns the file for this device.
    ///
    /// This is the descriptor obtained via `VFIO_GROUP_GET_DEVICE_FD`.
    pub fn dev_file(&self) -> &File {
        &self.dev
    }
1167 
    /// Returns PCI device name, formatted as BUS:DEVICE.FUNCTION string.
    ///
    /// The name is the final component of the sysfs path the device was
    /// constructed from. Returns `&String` (not `&str`) to keep the existing
    /// public API.
    pub fn device_name(&self) -> &String {
        &self.name
    }
1172 
    /// Returns the type of this VFIO device.
    ///
    /// Determined at construction time from the `VFIO_DEVICE_GET_INFO` flags
    /// (PCI or platform).
    pub fn device_type(&self) -> VfioDeviceType {
        self.dev_type
    }
1177 
    /// Returns the DT symbol (node label) of this VFIO device.
    ///
    /// Only present when one was supplied to [`VfioDevice::new_passthrough`].
    pub fn dt_symbol(&self) -> Option<&str> {
        self.dt_symbol.as_deref()
    }
1182 
1183     /// Returns the type and indentifier (if applicable) of the IOMMU used by this VFIO device and
1184     /// its master IDs.
iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])>1185     pub fn iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])> {
1186         // We currently only report IommuDevType::PkvmPviommu.
1187         if let Some((ref pviommu, ref ids)) = self.pviommu {
1188             Some((
1189                 IommuDevType::PkvmPviommu,
1190                 Some(pviommu.lock().id()),
1191                 ids.as_ref(),
1192             ))
1193         } else {
1194             None
1195         }
1196     }
1197 
    /// enter the device's low power state
    ///
    /// Issues `VFIO_DEVICE_FEATURE` with `VFIO_DEVICE_FEATURE_SET |
    /// VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY` and no payload.
    pub fn pm_low_power_enter(&self) -> Result<()> {
        // Zero-length trailing array: only the feature header is sent.
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY;
        // SAFETY:
        // Safe as we are the owner of self and power_management which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }
1212 
    /// enter the device's low power state with wakeup notification
    ///
    /// `wakeup_evt` is handed to the kernel as the eventfd to be signalled for
    /// wakeup; it is consumed (dropped) once the ioctl returns.
    pub fn pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()> {
        let payload = vfio_device_low_power_entry_with_wakeup {
            wakeup_eventfd: wakeup_evt.as_raw_descriptor(),
            reserved: 0,
        };
        let payload_size = mem::size_of::<vfio_device_low_power_entry_with_wakeup>();
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(payload_size);
        device_feature[0].argsz = (mem::size_of::<vfio_device_feature>() + payload_size) as u32;
        device_feature[0].flags =
            VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
        // SAFETY:
        // Safe as we know vfio_device_low_power_entry_with_wakeup has two 32-bit int fields
        // (wakeup_eventfd and reserved), so it transmutes exactly into [u8; 8].
        unsafe {
            device_feature[0]
                .data
                .as_mut_slice(payload_size)
                .copy_from_slice(
                    mem::transmute::<vfio_device_low_power_entry_with_wakeup, [u8; 8]>(payload)
                        .as_slice(),
                );
        }
        // SAFETY:
        // Safe as we are the owner of self and power_management which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }
1244 
    /// exit the device's low power state
    ///
    /// Issues `VFIO_DEVICE_FEATURE` with `VFIO_DEVICE_FEATURE_SET |
    /// VFIO_DEVICE_FEATURE_LOW_POWER_EXIT` and no payload.
    pub fn pm_low_power_exit(&self) -> Result<()> {
        // Zero-length trailing array: only the feature header is sent.
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT;
        // SAFETY:
        // Safe as we are the owner of self and power_management which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerExit(get_error()))
        } else {
            Ok(())
        }
    }
1259 
1260     /// call _DSM from the device's ACPI table
acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>>1261     pub fn acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>> {
1262         let count = args.len();
1263         let mut dsm = vec_with_array_field::<vfio_acpi_dsm, u8>(count);
1264         dsm[0].argsz = (mem::size_of::<vfio_acpi_dsm>() + mem::size_of_val(args)) as u32;
1265         dsm[0].padding = 0;
1266         // SAFETY:
1267         // Safe as we allocated enough space to hold args
1268         unsafe {
1269             dsm[0].args.as_mut_slice(count).clone_from_slice(args);
1270         }
1271         // SAFETY:
1272         // Safe as we are the owner of self and dsm which are valid value
1273         let ret = unsafe { ioctl_with_mut_ref(&self.dev, VFIO_DEVICE_ACPI_DSM(), &mut dsm[0]) };
1274         if ret < 0 {
1275             Err(VfioError::VfioAcpiDsm(get_error()))
1276         } else {
1277             // SAFETY:
1278             // Safe as we allocated enough space to hold args
1279             let res = unsafe { dsm[0].args.as_slice(count) };
1280             Ok(res.to_vec())
1281         }
1282     }
1283 
    /// Enable vfio device's ACPI notifications and associate EventFD with device.
    ///
    /// # Arguments
    /// * `acpi_notification_eventfd` - event the kernel signals on an ACPI
    ///   notification.
    /// * `index` - IRQ index to associate the eventfd with.
    pub fn acpi_notification_evt_enable(
        &self,
        acpi_notification_eventfd: &Event,
        index: u32,
    ) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let count = 1;

        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = count as u32;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
        // The eventfd descriptor is passed as a native-endian 32-bit payload.
        data.copy_from_slice(&acpi_notification_eventfd.as_raw_descriptor().to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationEnable(get_error()))
        } else {
            Ok(())
        }
    }
1314 
    /// Disable vfio device's ACPI notification and disconnect EventFd with device.
    pub fn acpi_notification_disable(&self, index: u32) -> Result<()> {
        // DATA_NONE + TRIGGER with count 0 and no payload tears down the
        // trigger previously installed for this index.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationDisable(get_error()))
        } else {
            Ok(())
        }
    }
1333 
    /// Test vfio device's ACPI notification by simulating hardware triggering.
    /// When the signaling mechanism is set, the VFIO_IRQ_SET_DATA_BOOL can be used with
    /// VFIO_IRQ_SET_ACTION_TRIGGER to perform kernel level interrupt loopback testing.
    ///
    /// `val` is the boolean payload (non-zero triggers) written for the single
    /// vector being tested.
    pub fn acpi_notification_test(&self, index: u32, val: u32) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_BOOL | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(u32_size) };
        data.copy_from_slice(&val.to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationTest(get_error()))
        } else {
            Ok(())
        }
    }
1360 
    /// Enable vfio device's irq and associate Irqfd Event with device.
    /// When MSIx is enabled, multi vectors will be supported, and vectors starting from subindex to
    /// subindex + descriptors length will be assigned with irqfd in the descriptors array.
    /// when index = VFIO_PCI_REQ_IRQ_INDEX, kernel vfio will trigger this event when physical
    /// device is removed.
    /// If descriptor is None, -1 is assigned to the irq. A value of -1 is used to either de-assign
    /// interrupts if already assigned or skip un-assigned interrupts.
    pub fn irq_enable(
        &self,
        descriptors: &[Option<&Event>],
        index: u32,
        subindex: u32,
    ) -> Result<()> {
        let count = descriptors.len();
        let u32_size = mem::size_of::<u32>();
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = subindex;
        irq_set[0].count = count as u32;

        // SAFETY:
        // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
        // is u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
        // together as u32. It is safe as enough space is reserved through
        // vec_with_array_field(u32)<count>.
        let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
        // Pack one native-endian 32-bit descriptor (or -1 placeholder) per vector.
        for descriptor in descriptors.iter().take(count) {
            let (left, right) = data.split_at_mut(u32_size);
            match descriptor {
                Some(fd) => left.copy_from_slice(&fd.as_raw_descriptor().to_ne_bytes()[..]),
                None => left.copy_from_slice(&(-1i32).to_ne_bytes()[..]),
            }
            data = right;
        }

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqEnable(get_error()))
        } else {
            Ok(())
        }
    }
1407 
1408     /// When intx is enabled, irqfd is used to trigger a level interrupt into guest, resample irqfd
1409     /// is used to get guest EOI notification.
1410     /// When host hw generates interrupt, vfio irq handler in host kernel receive and handle it,
1411     /// this handler disable hw irq first, then trigger irqfd to inject interrupt into guest. When
1412     /// resample irqfd is triggered by guest EOI, vfio kernel could enable hw irq, so hw could
1413     /// generate another interrupts.
1414     /// This function enable resample irqfd and let vfio kernel could get EOI notification.
1415     ///
1416     /// descriptor: should be resample IrqFd.
resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()>1417     pub fn resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()> {
1418         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
1419         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
1420         irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
1421         irq_set[0].index = index;
1422         irq_set[0].start = 0;
1423         irq_set[0].count = 1;
1424 
1425         {
1426             // SAFETY:
1427             // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data is
1428             // u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1429             // together as u32. It is safe as enough space is reserved through
1430             // vec_with_array_field(u32)<1>.
1431             let descriptors = unsafe { irq_set[0].data.as_mut_slice(4) };
1432             descriptors.copy_from_slice(&descriptor.as_raw_descriptor().to_le_bytes()[..]);
1433         }
1434 
1435         // SAFETY:
1436         // Safe as we are the owner of self and irq_set which are valid value
1437         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
1438         if ret < 0 {
1439             Err(VfioError::VfioIrqEnable(get_error()))
1440         } else {
1441             Ok(())
1442         }
1443     }
1444 
    /// disable vfio device's irq and disconnect Irqfd Event with device
    pub fn irq_disable(&self, index: u32) -> Result<()> {
        // DATA_NONE + TRIGGER with count 0 and no payload removes the
        // triggers previously installed for this index.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqDisable(get_error()))
        } else {
            Ok(())
        }
    }
1463 
    /// Unmask vfio device irq
    pub fn irq_unmask(&self, index: u32) -> Result<()> {
        // DATA_NONE + UNMASK on a single vector (start 0, count 1).
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqUnmask(get_error()))
        } else {
            Ok(())
        }
    }
1482 
    /// Mask vfio device irq
    pub fn irq_mask(&self, index: u32) -> Result<()> {
        // DATA_NONE + MASK on a single vector (start 0, count 1).
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqMask(get_error()))
        } else {
            Ok(())
        }
    }
1501 
1502     /// Get and validate VFIO device information.
get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)>1503     fn get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)> {
1504         let mut dev_info = vfio_device_info {
1505             argsz: mem::size_of::<vfio_device_info>() as u32,
1506             flags: 0,
1507             num_regions: 0,
1508             num_irqs: 0,
1509             ..Default::default()
1510         };
1511 
1512         // SAFETY:
1513         // Safe as we are the owner of device_file and dev_info which are valid value,
1514         // and we verify the return value.
1515         let ret = unsafe { ioctl_with_mut_ref(device_file, VFIO_DEVICE_GET_INFO(), &mut dev_info) };
1516         if ret < 0 {
1517             return Err(VfioError::VfioDeviceGetInfo(get_error()));
1518         }
1519 
1520         let dev_type = if (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) != 0 {
1521             if dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
1522                 || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
1523             {
1524                 return Err(VfioError::VfioDeviceGetInfo(get_error()));
1525             }
1526 
1527             VfioDeviceType::Pci
1528         } else if (dev_info.flags & VFIO_DEVICE_FLAGS_PLATFORM) != 0 {
1529             VfioDeviceType::Platform
1530         } else {
1531             return Err(VfioError::UnknownDeviceType(dev_info.flags));
1532         };
1533 
1534         Ok((dev_info, dev_type))
1535     }
1536 
1537     /// Query interrupt information
1538     /// return: Vector of interrupts information, each of which contains flags and index
get_irqs(&self) -> Result<Vec<VfioIrq>>1539     pub fn get_irqs(&self) -> Result<Vec<VfioIrq>> {
1540         let mut irqs: Vec<VfioIrq> = Vec::new();
1541 
1542         for i in 0..self.num_irqs {
1543             let argsz = mem::size_of::<vfio_irq_info>() as u32;
1544             let mut irq_info = vfio_irq_info {
1545                 argsz,
1546                 flags: 0,
1547                 index: i,
1548                 count: 0,
1549             };
1550             // SAFETY:
1551             // Safe as we are the owner of dev and irq_info which are valid value,
1552             // and we verify the return value.
1553             let ret = unsafe {
1554                 ioctl_with_mut_ref(
1555                     self.device_file(),
1556                     VFIO_DEVICE_GET_IRQ_INFO(),
1557                     &mut irq_info,
1558                 )
1559             };
1560             if ret < 0 || irq_info.count != 1 {
1561                 return Err(VfioError::VfioDeviceGetInfo(get_error()));
1562             }
1563 
1564             let irq = VfioIrq {
1565                 flags: irq_info.flags,
1566                 index: irq_info.index,
1567             };
1568             irqs.push(irq);
1569         }
1570         Ok(irqs)
1571     }
1572 
    /// Query the device for all of its regions (indices `0..num_regions`) via
    /// VFIO_DEVICE_GET_REGION_INFO, collecting each region's flags, size,
    /// offset, sparse-mmap areas, and (type, subtype) capability info.
    ///
    /// Regions whose initial info ioctl fails are silently skipped; a failure
    /// while re-querying with a capability buffer is fatal and returns
    /// `VfioDeviceGetRegionInfo`.
    #[allow(clippy::cast_ptr_alignment)]
    fn get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>> {
        let mut regions: Vec<VfioRegion> = Vec::new();
        for i in 0..num_regions {
            let argsz = mem::size_of::<vfio_region_info>() as u32;
            let mut reg_info = vfio_region_info {
                argsz,
                flags: 0,
                index: i,
                cap_offset: 0,
                size: 0,
                offset: 0,
            };
            let ret =
                // SAFETY:
                // Safe as we are the owner of dev and reg_info which are valid value,
                // and we verify the return value.
                unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO(), &mut reg_info) };
            if ret < 0 {
                // Skip this region index rather than failing the whole scan.
                continue;
            }

            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
            let mut cap_info: Option<(u32, u32)> = None;
            // A reported argsz larger than the one we passed means a
            // capability chain follows the fixed header; re-query with a
            // buffer big enough to hold it.
            if reg_info.argsz > argsz {
                let cap_len: usize = (reg_info.argsz - argsz) as usize;
                let mut region_with_cap =
                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
                region_with_cap[0].region_info.argsz = reg_info.argsz;
                region_with_cap[0].region_info.flags = 0;
                region_with_cap[0].region_info.index = i;
                region_with_cap[0].region_info.cap_offset = 0;
                region_with_cap[0].region_info.size = 0;
                region_with_cap[0].region_info.offset = 0;
                // SAFETY:
                // Safe as we are the owner of dev and region_info which are valid value,
                // and we verify the return value.
                let ret = unsafe {
                    ioctl_with_mut_ref(
                        dev,
                        VFIO_DEVICE_GET_REGION_INFO(),
                        &mut (region_with_cap[0].region_info),
                    )
                };
                if ret < 0 {
                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
                }

                // The kernel did not actually attach capabilities; skip.
                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
                    continue;
                }

                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
                let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
                let region_info_sz = reg_info.argsz;

                // region_with_cap[0].cap_info may contain many structures, like
                // vfio_region_info_cap_sparse_mmap struct or vfio_region_info_cap_type struct.
                // Both of them begin with vfio_info_cap_header, so we will get individual cap from
                // vfio_into_cap_header.
                // Go through all the cap structs.
                let info_ptr = region_with_cap.as_ptr() as *mut u8;
                let mut offset = region_with_cap[0].region_info.cap_offset;
                // Every bounds check below guards against a malformed chain
                // pointing past the buffer the kernel filled in.
                while offset != 0 {
                    if offset + cap_header_sz > region_info_sz {
                        break;
                    }
                    // SAFETY:
                    // Safe, as cap_header struct is in this function allocated region_with_cap
                    // vec.
                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
                    // SAFETY:
                    // Safe, as cap_header struct is in this function allocated region_with_cap
                    // vec.
                    let cap_header = unsafe { &*(cap_ptr as *const vfio_info_cap_header) };
                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
                        if offset + mmap_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
                        let sparse_mmap =
                            // SAFETY:
                            // Safe, this vfio_region_info_cap_sparse_mmap is in this function
                            // allocated region_with_cap vec.
                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_sparse_mmap) };

                        let area_num = sparse_mmap.nr_areas;
                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
                            break;
                        }
                        let areas =
                            // SAFETY:
                            // Safe, these vfio_region_sparse_mmap_area are in this function allocated
                            // region_with_cap vec.
                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
                        for area in areas.iter() {
                            mmaps.push(*area);
                        }
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
                        if offset + type_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_type here
                        let cap_type_info =
                            // SAFETY:
                            // Safe, this vfio_region_info_cap_type is in this function allocated
                            // region_with_cap vec
                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_type) };

                        cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_MSIX_MAPPABLE {
                        // MSI-X mappable capability: record the whole region
                        // as a single mappable area.
                        mmaps.push(vfio_region_sparse_mmap_area {
                            offset: 0,
                            size: region_with_cap[0].region_info.size,
                        });
                    }

                    offset = cap_header.next;
                }
            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // No capability chain: an MMAP-flagged region is mappable in
                // its entirety.
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: 0,
                    size: reg_info.size,
                });
            }

            let region = VfioRegion {
                flags: reg_info.flags,
                size: reg_info.size,
                offset: reg_info.offset,
                mmaps,
                cap_info,
            };
            regions.push(region);
        }

        Ok(regions)
    }
1713 
1714     /// get a region's flag
1715     /// the return's value may conatin:
1716     ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
1717     ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
1718     ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
1719     ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
get_region_flags(&self, index: usize) -> u321720     pub fn get_region_flags(&self, index: usize) -> u32 {
1721         match self.regions.get(index) {
1722             Some(v) => v.flags,
1723             None => {
1724                 warn!("get_region_flags() with invalid index: {}", index);
1725                 0
1726             }
1727         }
1728     }
1729 
1730     /// get a region's offset
1731     /// return: Region offset from the start of vfio device descriptor
get_region_offset(&self, index: usize) -> u641732     pub fn get_region_offset(&self, index: usize) -> u64 {
1733         match self.regions.get(index) {
1734             Some(v) => v.offset,
1735             None => {
1736                 warn!("get_region_offset with invalid index: {}", index);
1737                 0
1738             }
1739         }
1740     }
1741 
1742     /// get a region's size
1743     /// return: Region size from the start of vfio device descriptor
get_region_size(&self, index: usize) -> u641744     pub fn get_region_size(&self, index: usize) -> u64 {
1745         match self.regions.get(index) {
1746             Some(v) => v.size,
1747             None => {
1748                 warn!("get_region_size with invalid index: {}", index);
1749                 0
1750             }
1751         }
1752     }
1753 
    /// get a number of regions
    /// return: Number of regions of vfio device descriptor
    pub fn get_region_count(&self) -> usize {
        self.regions.len()
    }
1759 
1760     /// get a region's mmap info vector
get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area>1761     pub fn get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area> {
1762         match self.regions.get(index) {
1763             Some(v) => v.mmaps.clone(),
1764             None => {
1765                 warn!("get_region_mmap with invalid index: {}", index);
1766                 Vec::new()
1767             }
1768         }
1769     }
1770 
1771     /// find the specified cap type in device regions
1772     /// Input:
1773     ///      type_:  cap type
1774     ///      sub_type: cap sub_type
1775     /// Output:
1776     ///     None: device doesn't have the specified cap type
1777     ///     Some((bar_index, region_size)): device has the specified cap type, return region's
1778     ///                                     index and size
get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)>1779     pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
1780         for (index, region) in self.regions.iter().enumerate() {
1781             if let Some(cap_info) = &region.cap_info {
1782                 if cap_info.0 == type_ && cap_info.1 == sub_type {
1783                     return Some((index as u32, region.size));
1784                 }
1785             }
1786         }
1787 
1788         None
1789     }
1790 
1791     /// Returns file offset corresponding to the given `VfioRegionAddr`.
1792     /// The offset can be used when reading/writing the VFIO device's FD directly.
get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64>1793     pub fn get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64> {
1794         let region = self
1795             .regions
1796             .get(addr.index)
1797             .ok_or(VfioError::InvalidIndex(addr.index))?;
1798         Ok(region.offset + addr.addr)
1799     }
1800 
1801     /// Read region's data from VFIO device into buf
1802     /// index: region num
1803     /// buf: data destination and buf length is read size
1804     /// addr: offset in the region
region_read(&self, index: usize, buf: &mut [u8], addr: u64)1805     pub fn region_read(&self, index: usize, buf: &mut [u8], addr: u64) {
1806         let stub: &VfioRegion = self
1807             .regions
1808             .get(index)
1809             .unwrap_or_else(|| panic!("tried to read VFIO with an invalid index: {}", index));
1810 
1811         let size = buf.len() as u64;
1812         if size > stub.size || addr + size > stub.size {
1813             panic!(
1814                 "tried to read VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1815                 index, addr, size
1816             );
1817         }
1818 
1819         self.dev
1820             .read_exact_at(buf, stub.offset + addr)
1821             .unwrap_or_else(|e| {
1822                 panic!(
1823                     "failed to read region: index={}, addr=0x{:x}, error={}",
1824                     index, addr, e
1825                 )
1826             });
1827     }
1828 
    /// Reads a value from the specified `VfioRegionAddr.addr` + `offset`.
    pub fn region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T {
        // Zero-initialized backing storage for the value to be read.
        let mut val = mem::MaybeUninit::zeroed();
        let buf =
            // SAFETY:
            // Safe because we have zero-initialized `size_of::<T>()` bytes.
            unsafe { slice::from_raw_parts_mut(val.as_mut_ptr() as *mut u8, mem::size_of::<T>()) };
        // region_read panics on an invalid index or an out-of-range read.
        self.region_read(addr.index, buf, addr.addr + offset);
        // SAFETY:
        // Safe because any bit pattern is valid for a type that implements FromBytes.
        unsafe { val.assume_init() }
    }
1841 
1842     /// write the data from buf into a vfio device region
1843     /// index: region num
1844     /// buf: data src and buf length is write size
1845     /// addr: offset in the region
region_write(&self, index: usize, buf: &[u8], addr: u64)1846     pub fn region_write(&self, index: usize, buf: &[u8], addr: u64) {
1847         let stub: &VfioRegion = self
1848             .regions
1849             .get(index)
1850             .unwrap_or_else(|| panic!("tried to write VFIO with an invalid index: {}", index));
1851 
1852         let size = buf.len() as u64;
1853         if size > stub.size
1854             || addr + size > stub.size
1855             || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
1856         {
1857             panic!(
1858                 "tried to write VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1859                 index, addr, size
1860             );
1861         }
1862 
1863         self.dev
1864             .write_all_at(buf, stub.offset + addr)
1865             .unwrap_or_else(|e| {
1866                 panic!(
1867                     "failed to write region: index={}, addr=0x{:x}, error={}",
1868                     index, addr, e
1869                 )
1870             });
1871     }
1872 
    /// Writes data into the specified `VfioRegionAddr.addr` + `offset`.
    pub fn region_write_to_addr<T: AsBytes>(&self, val: &T, addr: &VfioRegionAddr, offset: u64) {
        // region_write panics on an invalid index, an out-of-range write, or
        // a region without VFIO_REGION_INFO_FLAG_WRITE.
        self.region_write(addr.index, val.as_bytes(), addr.addr + offset);
    }
1877 
1878     /// get vfio device's descriptors which are passed into minijail process
keep_rds(&self) -> Vec<RawDescriptor>1879     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
1880         vec![
1881             self.dev.as_raw_descriptor(),
1882             self.group_descriptor,
1883             self.container.lock().as_raw_descriptor(),
1884         ]
1885     }
1886 
    /// Add (iova, user_addr) map into vfio container iommu table
    /// # Safety
    ///
    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        // Delegates to the shared container, which performs the actual map.
        self.container
            .lock()
            .vfio_dma_map(iova, size, user_addr, write_en)
    }
1902 
    /// Remove (iova, user_addr) map from vfio container iommu table
    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        // Delegates to the shared container, which performs the actual unmap.
        self.container.lock().vfio_dma_unmap(iova, size)
    }
1907 
vfio_get_iommu_page_size_mask(&self) -> Result<u64>1908     pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
1909         self.container.lock().vfio_get_iommu_page_size_mask()
1910     }
1911 
alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64>1912     pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64> {
1913         self.iova_alloc
1914             .lock()
1915             .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size)
1916             .map_err(VfioError::Resources)
1917     }
1918 
get_iova(&self, alloc: &Alloc) -> Option<AddressRange>1919     pub fn get_iova(&self, alloc: &Alloc) -> Option<AddressRange> {
1920         self.iova_alloc.lock().get(alloc).map(|res| res.0)
1921     }
1922 
release_iova(&self, alloc: Alloc) -> Result<AddressRange>1923     pub fn release_iova(&self, alloc: Alloc) -> Result<AddressRange> {
1924         self.iova_alloc
1925             .lock()
1926             .release(alloc)
1927             .map_err(VfioError::Resources)
1928     }
1929 
get_max_addr(&self) -> u641930     pub fn get_max_addr(&self) -> u64 {
1931         self.iova_alloc.lock().get_max_addr()
1932     }
1933 
    /// Gets the vfio device backing `File`.
    pub fn device_file(&self) -> &File {
        // Borrowed, not duplicated: callers share the same descriptor.
        &self.dev
    }
1938 
    /// close vfio device
    pub fn close(&self) {
        // Detach this device's group from the shared VFIO container.
        self.container.lock().remove_group(self.group_id, true);
    }
1943 }
1944 
/// Accessor for the PCI configuration space of a `VfioDevice`.
pub struct VfioPciConfig {
    // The underlying VFIO device whose config region is read/written.
    device: Arc<VfioDevice>,
}
1948 
1949 impl VfioPciConfig {
new(device: Arc<VfioDevice>) -> Self1950     pub fn new(device: Arc<VfioDevice>) -> Self {
1951         VfioPciConfig { device }
1952     }
1953 
read_config<T: FromBytes>(&self, offset: u32) -> T1954     pub fn read_config<T: FromBytes>(&self, offset: u32) -> T {
1955         let mut buf = vec![0u8; std::mem::size_of::<T>()];
1956         self.device.region_read(
1957             VFIO_PCI_CONFIG_REGION_INDEX as usize,
1958             &mut buf,
1959             offset.into(),
1960         );
1961         T::read_from(&buf[..]).expect("failed to convert config data from slice")
1962     }
1963 
write_config<T: AsBytes>(&self, config: T, offset: u32)1964     pub fn write_config<T: AsBytes>(&self, config: T, offset: u32) {
1965         self.device.region_write(
1966             VFIO_PCI_CONFIG_REGION_INDEX as usize,
1967             config.as_bytes(),
1968             offset.into(),
1969         );
1970     }
1971 
1972     /// Set the VFIO device this config refers to as the bus master.
set_bus_master(&self)1973     pub fn set_bus_master(&self) {
1974         /// Constant definitions from `linux/pci_regs.h`.
1975         const PCI_COMMAND: u32 = 0x4;
1976         /// Enable bus mastering
1977         const PCI_COMMAND_MASTER: u16 = 0x4;
1978 
1979         let mut cmd: u16 = self.read_config(PCI_COMMAND);
1980 
1981         if cmd & PCI_COMMAND_MASTER != 0 {
1982             return;
1983         }
1984 
1985         cmd |= PCI_COMMAND_MASTER;
1986 
1987         self.write_config(cmd, PCI_COMMAND);
1988     }
1989 }
1990 
impl AsRawDescriptor for VfioDevice {
    // Expose the backing device file's raw descriptor so a VfioDevice can be
    // used wherever a raw descriptor is expected.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.dev.as_raw_descriptor()
    }
}
1996