• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::collections::HashMap;
6 use std::ffi::CString;
7 use std::fs::File;
8 use std::fs::OpenOptions;
9 use std::io;
10 use std::mem;
11 use std::os::raw::c_ulong;
12 use std::os::unix::prelude::FileExt;
13 use std::path::Path;
14 use std::path::PathBuf;
15 #[cfg(all(target_os = "android", target_arch = "aarch64"))]
16 use std::ptr::addr_of_mut;
17 use std::slice;
18 use std::sync::Arc;
19 
20 use base::error;
21 use base::ioctl;
22 use base::ioctl_with_mut_ptr;
23 use base::ioctl_with_mut_ref;
24 use base::ioctl_with_ptr;
25 use base::ioctl_with_ref;
26 use base::ioctl_with_val;
27 use base::warn;
28 use base::AsRawDescriptor;
29 use base::Error;
30 use base::Event;
31 use base::FromRawDescriptor;
32 use base::RawDescriptor;
33 use base::SafeDescriptor;
34 use cfg_if::cfg_if;
35 use data_model::vec_with_array_field;
36 use hypervisor::DeviceKind;
37 use hypervisor::Vm;
38 use once_cell::sync::OnceCell;
39 use rand::seq::index::sample;
40 use rand::thread_rng;
41 use remain::sorted;
42 use resources::address_allocator::AddressAllocator;
43 use resources::AddressRange;
44 use resources::Alloc;
45 use resources::Error as ResourcesError;
46 use sync::Mutex;
47 use thiserror::Error;
48 use vfio_sys::vfio::vfio_acpi_dsm;
49 use vfio_sys::vfio::VFIO_IRQ_SET_DATA_BOOL;
50 use vfio_sys::*;
51 use zerocopy::FromBytes;
52 use zerocopy::Immutable;
53 use zerocopy::IntoBytes;
54 
55 use crate::IommuDevType;
56 
57 #[sorted]
58 #[derive(Error, Debug)]
59 pub enum VfioError {
60     #[error("failed to duplicate VfioContainer")]
61     ContainerDupError,
62     #[error("failed to set container's IOMMU driver type as {0:?}: {1}")]
63     ContainerSetIOMMU(IommuType, Error),
64     #[error("failed to create KVM vfio device: {0}")]
65     CreateVfioKvmDevice(Error),
66     #[error("failed to get Group Status: {0}")]
67     GetGroupStatus(Error),
68     #[error("failed to get vfio device fd: {0}")]
69     GroupGetDeviceFD(Error),
70     #[error("failed to add vfio group into vfio container: {0}")]
71     GroupSetContainer(Error),
72     #[error("group is inviable")]
73     GroupViable,
74     #[error("invalid region index: {0}")]
75     InvalidIndex(usize),
76     #[error("invalid operation")]
77     InvalidOperation,
78     #[error("invalid file path")]
79     InvalidPath,
80     #[error("failed to add guest memory map into iommu table: {0}")]
81     IommuDmaMap(Error),
82     #[error("failed to remove guest memory map from iommu table: {0}")]
83     IommuDmaUnmap(Error),
84     #[error("failed to get IOMMU cap info from host")]
85     IommuGetCapInfo,
86     #[error("failed to get IOMMU info from host: {0}")]
87     IommuGetInfo(Error),
88     #[error("failed to attach device to pKVM pvIOMMU: {0}")]
89     KvmPviommuSetConfig(Error),
90     #[error("failed to set KVM vfio device's attribute: {0}")]
91     KvmSetDeviceAttr(Error),
92     #[error("AddressAllocator is unavailable")]
93     NoRescAlloc,
94     #[error("failed to open /dev/vfio/vfio container: {0}")]
95     OpenContainer(io::Error),
96     #[error("failed to open {1} group: {0}")]
97     OpenGroup(io::Error, String),
98     #[error("failed to read {1} link: {0}")]
99     ReadLink(io::Error, PathBuf),
100     #[error("resources error: {0}")]
101     Resources(ResourcesError),
102     #[error("unknown vfio device type (flags: {0:#x})")]
103     UnknownDeviceType(u32),
104     #[error("failed to call vfio device's ACPI _DSM: {0}")]
105     VfioAcpiDsm(Error),
106     #[error("failed to disable vfio deviece's acpi notification: {0}")]
107     VfioAcpiNotificationDisable(Error),
108     #[error("failed to enable vfio deviece's acpi notification: {0}")]
109     VfioAcpiNotificationEnable(Error),
110     #[error("failed to test vfio deviece's acpi notification: {0}")]
111     VfioAcpiNotificationTest(Error),
112     #[error(
113         "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
114     )]
115     VfioApiVersion,
116     #[error("failed to get vfio device's info or info doesn't match: {0}")]
117     VfioDeviceGetInfo(Error),
118     #[error("failed to get vfio device's region info: {0}")]
119     VfioDeviceGetRegionInfo(Error),
120     #[error("container doesn't support IOMMU driver type {0:?}")]
121     VfioIommuSupport(IommuType),
122     #[error("failed to disable vfio deviece's irq: {0}")]
123     VfioIrqDisable(Error),
124     #[error("failed to enable vfio deviece's irq: {0}")]
125     VfioIrqEnable(Error),
126     #[error("failed to mask vfio deviece's irq: {0}")]
127     VfioIrqMask(Error),
128     #[error("failed to unmask vfio deviece's irq: {0}")]
129     VfioIrqUnmask(Error),
130     #[error("failed to enter vfio deviece's low power state: {0}")]
131     VfioPmLowPowerEnter(Error),
132     #[error("failed to exit vfio deviece's low power state: {0}")]
133     VfioPmLowPowerExit(Error),
134 }
135 
/// Shorthand for results whose error type is [`VfioError`].
type Result<T> = std::result::Result<T, VfioError>;
137 
get_error() -> Error138 fn get_error() -> Error {
139     Error::last()
140 }
141 
/// Descriptor of the KVM VFIO device, shared by every user in this file.
///
/// It is filled in on first use through `vm.create_device(DeviceKind::Vfio)` and then reused for
/// all later `KVM_SET_DEVICE_ATTR` calls.
static KVM_VFIO_FILE: OnceCell<SafeDescriptor> = OnceCell::new();
143 
/// Kinds of VFIO device distinguished by this module.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VfioDeviceType {
    Pci,
    Platform,
}
149 
/// Operation requested from `VfioGroup::kvm_device_set_group`.
enum KvmVfioGroupOps {
    // Sent to the KVM VFIO device as `KVM_DEV_VFIO_GROUP_ADD`.
    Add,
    // Sent to the KVM VFIO device as `KVM_DEV_VFIO_GROUP_DEL`.
    Delete,
}
154 
/// Handle to a pvIOMMU obtained from the KVM VFIO device.
#[derive(Debug)]
pub struct KvmVfioPviommu {
    // File wrapping the descriptor returned by the `KVM_DEV_VFIO_PVIOMMU_ATTACH` request.
    file: File,
}
159 
160 impl KvmVfioPviommu {
new(vm: &impl Vm) -> Result<Self>161     pub fn new(vm: &impl Vm) -> Result<Self> {
162         cfg_if! {
163             if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
164                 let file = Self::ioctl_kvm_dev_vfio_pviommu_attach(vm)?;
165 
166                 Ok(Self { file })
167             } else {
168                 let _ = vm;
169                 unimplemented!()
170             }
171         }
172     }
173 
attach<T: AsRawDescriptor>(&self, device: &T, sid_idx: u32, vsid: u32) -> Result<()>174     pub fn attach<T: AsRawDescriptor>(&self, device: &T, sid_idx: u32, vsid: u32) -> Result<()> {
175         cfg_if! {
176             if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
177                 self.ioctl_kvm_pviommu_set_config(device, sid_idx, vsid)
178             } else {
179                 let _ = device;
180                 let _ = sid_idx;
181                 let _ = vsid;
182                 unimplemented!()
183             }
184         }
185     }
186 
id(&self) -> u32187     pub fn id(&self) -> u32 {
188         let fd = self.as_raw_descriptor();
189         // Guests identify pvIOMMUs to the hypervisor using the corresponding VMM FDs.
190         fd.try_into().unwrap()
191     }
192 
get_sid_count<T: AsRawDescriptor>(vm: &impl Vm, device: &T) -> Result<u32>193     pub fn get_sid_count<T: AsRawDescriptor>(vm: &impl Vm, device: &T) -> Result<u32> {
194         cfg_if! {
195             if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
196                 let info = Self::ioctl_kvm_dev_vfio_pviommu_get_info(vm, device)?;
197 
198                 Ok(info.nr_sids)
199             } else {
200                 let _ = vm;
201                 let _ = device;
202                 unimplemented!()
203             }
204         }
205     }
206 
207     #[cfg(all(target_os = "android", target_arch = "aarch64"))]
ioctl_kvm_dev_vfio_pviommu_attach(vm: &impl Vm) -> Result<File>208     fn ioctl_kvm_dev_vfio_pviommu_attach(vm: &impl Vm) -> Result<File> {
209         let kvm_vfio_file = KVM_VFIO_FILE
210             .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
211             .map_err(VfioError::CreateVfioKvmDevice)?;
212 
213         let vfio_dev_attr = kvm_sys::kvm_device_attr {
214             flags: 0,
215             group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
216             attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_ATTACH as u64,
217             addr: 0,
218         };
219 
220         // SAFETY:
221         // Safe as we are the owner of vfio_dev_attr, which is valid.
222         let ret =
223             unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };
224 
225         if ret < 0 {
226             Err(VfioError::KvmSetDeviceAttr(get_error()))
227         } else {
228             // SAFETY: Safe as we verify the return value.
229             Ok(unsafe { File::from_raw_descriptor(ret) })
230         }
231     }
232 
233     #[cfg(all(target_os = "android", target_arch = "aarch64"))]
ioctl_kvm_pviommu_set_config<T: AsRawDescriptor>( &self, device: &T, sid_idx: u32, vsid: u32, ) -> Result<()>234     fn ioctl_kvm_pviommu_set_config<T: AsRawDescriptor>(
235         &self,
236         device: &T,
237         sid_idx: u32,
238         vsid: u32,
239     ) -> Result<()> {
240         let config = kvm_sys::kvm_vfio_iommu_config {
241             device_fd: device.as_raw_descriptor(),
242             sid_idx,
243             vsid,
244         };
245 
246         // SAFETY:
247         // Safe as we are the owner of device and config which are valid, and we verify the return
248         // value.
249         let ret = unsafe { ioctl_with_ref(self, kvm_sys::KVM_PVIOMMU_SET_CONFIG, &config) };
250 
251         if ret < 0 {
252             Err(VfioError::KvmPviommuSetConfig(get_error()))
253         } else {
254             Ok(())
255         }
256     }
257 
258     #[cfg(all(target_os = "android", target_arch = "aarch64"))]
ioctl_kvm_dev_vfio_pviommu_get_info<T: AsRawDescriptor>( vm: &impl Vm, device: &T, ) -> Result<kvm_sys::kvm_vfio_iommu_info>259     fn ioctl_kvm_dev_vfio_pviommu_get_info<T: AsRawDescriptor>(
260         vm: &impl Vm,
261         device: &T,
262     ) -> Result<kvm_sys::kvm_vfio_iommu_info> {
263         let kvm_vfio_file = KVM_VFIO_FILE
264             .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
265             .map_err(VfioError::CreateVfioKvmDevice)?;
266 
267         let mut info = kvm_sys::kvm_vfio_iommu_info {
268             device_fd: device.as_raw_descriptor(),
269             nr_sids: 0,
270         };
271 
272         let vfio_dev_attr = kvm_sys::kvm_device_attr {
273             flags: 0,
274             group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
275             attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_GET_INFO as u64,
276             addr: addr_of_mut!(info) as usize as u64,
277         };
278 
279         // SAFETY:
280         // Safe as we are the owner of vfio_dev_attr, which is valid.
281         let ret =
282             unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };
283 
284         if ret < 0 {
285             Err(VfioError::KvmSetDeviceAttr(get_error()))
286         } else {
287             Ok(info)
288         }
289     }
290 }
291 
292 impl AsRawDescriptor for KvmVfioPviommu {
as_raw_descriptor(&self) -> RawDescriptor293     fn as_raw_descriptor(&self) -> RawDescriptor {
294         self.file.as_raw_descriptor()
295     }
296 }
297 
/// IOMMU driver types a [`VfioContainer`] can be configured with.
///
/// The discriminant is the value passed to the `VFIO_CHECK_EXTENSION` and `VFIO_SET_IOMMU`
/// ioctls.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum IommuType {
    Type1V2 = VFIO_TYPE1v2_IOMMU,
    PkvmPviommu = VFIO_PKVM_PVIOMMU,
    // ChromeOS specific vfio_iommu_type1 implementation that is optimized for
    // small, dynamic mappings. For clients which create large, relatively
    // static mappings, Type1V2 is still preferred.
    //
    // See crrev.com/c/3593528 for the implementation.
    Type1ChromeOS = 100001,
}
310 
/// A VFIO container: it owns the container file, tracks the [`VfioGroup`]s attached to it, and
/// records which IOMMU driver type has been selected for it.
pub struct VfioContainer {
    // Open VFIO container file (`/dev/vfio/vfio` when created through `new`).
    container: File,
    // Groups currently attached to this container, keyed by group id.
    groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
    // IOMMU driver type chosen by `set_iommu_checked`; `None` until it succeeds.
    iommu_type: Option<IommuType>,
}
317 
extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T> where T: FromBytes,318 fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T>
319 where
320     T: FromBytes,
321 {
322     Some(T::read_from_prefix(bytes.get(offset..)?).ok()?.0)
323 }
324 
/// VFIO API version that `VFIO_GET_API_VERSION` must report for a container to be accepted.
const VFIO_API_VERSION: u8 = 0;
326 impl VfioContainer {
new() -> Result<Self>327     pub fn new() -> Result<Self> {
328         let container = OpenOptions::new()
329             .read(true)
330             .write(true)
331             .open("/dev/vfio/vfio")
332             .map_err(VfioError::OpenContainer)?;
333 
334         Self::new_from_container(container)
335     }
336 
337     // Construct a VfioContainer from an exist container file.
new_from_container(container: File) -> Result<Self>338     pub fn new_from_container(container: File) -> Result<Self> {
339         // SAFETY:
340         // Safe as file is vfio container descriptor and ioctl is defined by kernel.
341         let version = unsafe { ioctl(&container, VFIO_GET_API_VERSION) };
342         if version as u8 != VFIO_API_VERSION {
343             return Err(VfioError::VfioApiVersion);
344         }
345 
346         Ok(VfioContainer {
347             container,
348             groups: HashMap::new(),
349             iommu_type: None,
350         })
351     }
352 
    /// Returns `true` if a group with `group_id` is currently tracked by this container.
    fn is_group_set(&self, group_id: u32) -> bool {
        self.groups.contains_key(&group_id)
    }
356 
check_extension(&self, val: IommuType) -> bool357     fn check_extension(&self, val: IommuType) -> bool {
358         // SAFETY:
359         // Safe as file is vfio container and make sure val is valid.
360         let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION, val as c_ulong) };
361         ret != 0
362     }
363 
set_iommu(&mut self, val: IommuType) -> i32364     fn set_iommu(&mut self, val: IommuType) -> i32 {
365         // SAFETY:
366         // Safe as file is vfio container and make sure val is valid.
367         unsafe { ioctl_with_val(self, VFIO_SET_IOMMU, val as c_ulong) }
368     }
369 
set_iommu_checked(&mut self, val: IommuType) -> Result<()>370     fn set_iommu_checked(&mut self, val: IommuType) -> Result<()> {
371         if !self.check_extension(val) {
372             Err(VfioError::VfioIommuSupport(val))
373         } else if self.set_iommu(val) != 0 {
374             Err(VfioError::ContainerSetIOMMU(val, get_error()))
375         } else {
376             self.iommu_type = Some(val);
377             Ok(())
378         }
379     }
380 
381     /// # Safety
382     ///
383     /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
vfio_dma_map( &self, iova: u64, size: u64, user_addr: u64, write_en: bool, ) -> Result<()>384     pub unsafe fn vfio_dma_map(
385         &self,
386         iova: u64,
387         size: u64,
388         user_addr: u64,
389         write_en: bool,
390     ) -> Result<()> {
391         match self
392             .iommu_type
393             .expect("vfio_dma_map called before configuring IOMMU")
394         {
395             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
396                 self.vfio_iommu_type1_dma_map(iova, size, user_addr, write_en)
397             }
398             IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
399         }
400     }
401 
402     /// # Safety
403     ///
404     /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
vfio_iommu_type1_dma_map( &self, iova: u64, size: u64, user_addr: u64, write_en: bool, ) -> Result<()>405     unsafe fn vfio_iommu_type1_dma_map(
406         &self,
407         iova: u64,
408         size: u64,
409         user_addr: u64,
410         write_en: bool,
411     ) -> Result<()> {
412         let mut dma_map = vfio_iommu_type1_dma_map {
413             argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
414             flags: VFIO_DMA_MAP_FLAG_READ,
415             vaddr: user_addr,
416             iova,
417             size,
418         };
419 
420         if write_en {
421             dma_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
422         }
423 
424         let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA, &dma_map);
425         if ret != 0 {
426             return Err(VfioError::IommuDmaMap(get_error()));
427         }
428 
429         Ok(())
430     }
431 
vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()>432     pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
433         match self
434             .iommu_type
435             .expect("vfio_dma_unmap called before configuring IOMMU")
436         {
437             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
438                 self.vfio_iommu_type1_dma_unmap(iova, size)
439             }
440             IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
441         }
442     }
443 
vfio_iommu_type1_dma_unmap(&self, iova: u64, size: u64) -> Result<()>444     fn vfio_iommu_type1_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
445         let mut dma_unmap = vfio_iommu_type1_dma_unmap {
446             argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
447             flags: 0,
448             iova,
449             size,
450             ..Default::default()
451         };
452 
453         // SAFETY:
454         // Safe as file is vfio container, dma_unmap is constructed by us, and
455         // we check the return value
456         let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA, &mut dma_unmap) };
457         if ret != 0 || dma_unmap.size != size {
458             return Err(VfioError::IommuDmaUnmap(get_error()));
459         }
460 
461         Ok(())
462     }
463 
vfio_get_iommu_page_size_mask(&self) -> Result<u64>464     pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
465         match self
466             .iommu_type
467             .expect("vfio_get_iommu_page_size_mask called before configuring IOMMU")
468         {
469             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
470                 self.vfio_iommu_type1_get_iommu_page_size_mask()
471             }
472             IommuType::PkvmPviommu => Ok(0),
473         }
474     }
475 
vfio_iommu_type1_get_iommu_page_size_mask(&self) -> Result<u64>476     fn vfio_iommu_type1_get_iommu_page_size_mask(&self) -> Result<u64> {
477         let mut iommu_info = vfio_iommu_type1_info {
478             argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
479             flags: 0,
480             iova_pgsizes: 0,
481             ..Default::default()
482         };
483 
484         // SAFETY:
485         // Safe as file is vfio container, iommu_info has valid values,
486         // and we check the return value
487         let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info) };
488         if ret != 0 || (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES) == 0 {
489             return Err(VfioError::IommuGetInfo(get_error()));
490         }
491 
492         Ok(iommu_info.iova_pgsizes)
493     }
494 
vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>>495     pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
496         match self
497             .iommu_type
498             .expect("vfio_iommu_iova_get_iova_ranges called before configuring IOMMU")
499         {
500             IommuType::Type1V2 | IommuType::Type1ChromeOS => {
501                 self.vfio_iommu_type1_get_iova_ranges()
502             }
503             IommuType::PkvmPviommu => Ok(Vec::new()),
504         }
505     }
506 
vfio_iommu_type1_get_iova_ranges(&self) -> Result<Vec<AddressRange>>507     fn vfio_iommu_type1_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
508         // Query the buffer size needed fetch the capabilities.
509         let mut iommu_info_argsz = vfio_iommu_type1_info {
510             argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
511             flags: 0,
512             iova_pgsizes: 0,
513             ..Default::default()
514         };
515 
516         // SAFETY:
517         // Safe as file is vfio container, iommu_info_argsz has valid values,
518         // and we check the return value
519         let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info_argsz) };
520         if ret != 0 {
521             return Err(VfioError::IommuGetInfo(get_error()));
522         }
523 
524         if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
525             return Err(VfioError::IommuGetCapInfo);
526         }
527 
528         let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
529             iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
530         );
531         iommu_info[0].argsz = iommu_info_argsz.argsz;
532         let ret =
533             // SAFETY:
534             // Safe as file is vfio container, iommu_info has valid values,
535             // and we check the return value
536             unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO, iommu_info.as_mut_ptr()) };
537         if ret != 0 {
538             return Err(VfioError::IommuGetInfo(get_error()));
539         }
540 
541         // SAFETY:
542         // Safe because we initialized iommu_info with enough space, u8 has less strict
543         // alignment, and since it will no longer be mutated.
544         let info_bytes = unsafe {
545             std::slice::from_raw_parts(
546                 iommu_info.as_ptr() as *const u8,
547                 iommu_info_argsz.argsz as usize,
548             )
549         };
550 
551         if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 {
552             return Err(VfioError::IommuGetCapInfo);
553         }
554 
555         let mut offset = iommu_info[0].cap_offset as usize;
556         while offset != 0 {
557             let header = extract_vfio_struct::<vfio_info_cap_header>(info_bytes, offset)
558                 .ok_or(VfioError::IommuGetCapInfo)?;
559 
560             if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 {
561                 let iova_header =
562                     extract_vfio_struct::<vfio_iommu_type1_info_cap_iova_range_header>(
563                         info_bytes, offset,
564                     )
565                     .ok_or(VfioError::IommuGetCapInfo)?;
566                 let range_offset = offset + mem::size_of::<vfio_iommu_type1_info_cap_iova_range>();
567                 let mut ret = Vec::new();
568                 for i in 0..iova_header.nr_iovas {
569                     ret.push(
570                         extract_vfio_struct::<vfio_iova_range>(
571                             info_bytes,
572                             range_offset + i as usize * mem::size_of::<vfio_iova_range>(),
573                         )
574                         .ok_or(VfioError::IommuGetCapInfo)?,
575                     );
576                 }
577                 return Ok(ret
578                     .iter()
579                     .map(|range| AddressRange {
580                         start: range.start,
581                         end: range.end,
582                     })
583                     .collect());
584             }
585             offset = header.next as usize;
586         }
587 
588         Err(VfioError::IommuGetCapInfo)
589     }
590 
set_iommu_from(&mut self, iommu_dev: IommuDevType) -> Result<()>591     fn set_iommu_from(&mut self, iommu_dev: IommuDevType) -> Result<()> {
592         match iommu_dev {
593             IommuDevType::CoIommu | IommuDevType::VirtioIommu => {
594                 // If we expect granular, dynamic mappings, try the ChromeOS Type1ChromeOS first,
595                 // then fall back to upstream versions.
596                 self.set_iommu_checked(IommuType::Type1ChromeOS)
597                     .or_else(|_| self.set_iommu_checked(IommuType::Type1V2))
598             }
599             IommuDevType::NoIommu => self.set_iommu_checked(IommuType::Type1V2),
600             IommuDevType::PkvmPviommu => self.set_iommu_checked(IommuType::PkvmPviommu),
601         }
602     }
603 
get_group_with_vm( &mut self, id: u32, vm: &impl Vm, iommu_dev: IommuDevType, ) -> Result<Arc<Mutex<VfioGroup>>>604     fn get_group_with_vm(
605         &mut self,
606         id: u32,
607         vm: &impl Vm,
608         iommu_dev: IommuDevType,
609     ) -> Result<Arc<Mutex<VfioGroup>>> {
610         if let Some(group) = self.groups.get(&id) {
611             return Ok(group.clone());
612         }
613 
614         let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
615         if self.groups.is_empty() {
616             self.set_iommu_from(iommu_dev)?;
617             // Before the first group is added into container, do once per container
618             // initialization. Both coiommu and virtio-iommu rely on small, dynamic
619             // mappings. However, if an iommu is not enabled, then we map the entirety
620             // of guest memory as a small number of large, static mappings.
621             match iommu_dev {
622                 IommuDevType::CoIommu | IommuDevType::PkvmPviommu | IommuDevType::VirtioIommu => {}
623                 IommuDevType::NoIommu => {
624                     for region in vm.get_memory().regions() {
625                         // SAFETY:
626                         // Safe because the guest regions are guaranteed not to overlap
627                         unsafe {
628                             self.vfio_dma_map(
629                                 region.guest_addr.0,
630                                 region.size as u64,
631                                 region.host_addr as u64,
632                                 true,
633                             )
634                         }?;
635                     }
636                 }
637             }
638         }
639 
640         let kvm_vfio_file = KVM_VFIO_FILE
641             .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
642             .map_err(VfioError::CreateVfioKvmDevice)?;
643         group
644             .lock()
645             .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Add)?;
646 
647         self.groups.insert(id, group.clone());
648 
649         Ok(group)
650     }
651 
get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>>652     fn get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>> {
653         if let Some(group) = self.groups.get(&id) {
654             return Ok(group.clone());
655         }
656 
657         let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
658 
659         if self.groups.is_empty() {
660             // Before the first group is added into container, do once per
661             // container initialization.
662             self.set_iommu_checked(IommuType::Type1V2)?;
663         }
664 
665         self.groups.insert(id, group.clone());
666         Ok(group)
667     }
668 
remove_group(&mut self, id: u32, reduce: bool)669     fn remove_group(&mut self, id: u32, reduce: bool) {
670         let mut remove = false;
671 
672         if let Some(group) = self.groups.get(&id) {
673             if reduce {
674                 group.lock().reduce_device_num();
675             }
676             if group.lock().device_num() == 0 {
677                 let kvm_vfio_file = KVM_VFIO_FILE.get().expect("kvm vfio file isn't created");
678                 if group
679                     .lock()
680                     .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Delete)
681                     .is_err()
682                 {
683                     warn!("failing in remove vfio group from kvm device");
684                 }
685                 remove = true;
686             }
687         }
688 
689         if remove {
690             self.groups.remove(&id);
691         }
692     }
693 
clone_as_raw_descriptor(&self) -> Result<RawDescriptor>694     pub fn clone_as_raw_descriptor(&self) -> Result<RawDescriptor> {
695         // SAFETY: this call is safe because it doesn't modify any memory and we
696         // check the return value.
697         let raw_descriptor = unsafe { libc::dup(self.container.as_raw_descriptor()) };
698         if raw_descriptor < 0 {
699             Err(VfioError::ContainerDupError)
700         } else {
701             Ok(raw_descriptor)
702         }
703     }
704 
705     // Gets group ids for all groups in the container.
group_ids(&self) -> Vec<&u32>706     pub fn group_ids(&self) -> Vec<&u32> {
707         self.groups.keys().collect()
708     }
709 }
710 
711 impl AsRawDescriptor for VfioContainer {
as_raw_descriptor(&self) -> RawDescriptor712     fn as_raw_descriptor(&self) -> RawDescriptor {
713         self.container.as_raw_descriptor()
714     }
715 }
716 
/// A VFIO group that has been opened and attached to a [`VfioContainer`].
struct VfioGroup {
    // Open group file (`/dev/vfio/<id>`).
    group: File,
    // Counter adjusted by `add_device_num` / `reduce_device_num`; the container removes the
    // group once it reaches zero.
    device_num: u32,
}
721 
722 impl VfioGroup {
new(container: &VfioContainer, id: u32) -> Result<Self>723     fn new(container: &VfioContainer, id: u32) -> Result<Self> {
724         let group_path = format!("/dev/vfio/{}", id);
725         let group_file = OpenOptions::new()
726             .read(true)
727             .write(true)
728             .open(Path::new(&group_path))
729             .map_err(|e| VfioError::OpenGroup(e, group_path))?;
730 
731         let mut group_status = vfio_group_status {
732             argsz: mem::size_of::<vfio_group_status>() as u32,
733             flags: 0,
734         };
735         let mut ret =
736             // SAFETY:
737             // Safe as we are the owner of group_file and group_status which are valid value.
738             unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS, &mut group_status) };
739         if ret < 0 {
740             return Err(VfioError::GetGroupStatus(get_error()));
741         }
742 
743         if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
744             return Err(VfioError::GroupViable);
745         }
746 
747         let container_raw_descriptor = container.as_raw_descriptor();
748         // SAFETY:
749         // Safe as we are the owner of group_file and container_raw_descriptor which are valid
750         // value, and we verify the ret value
751         ret = unsafe {
752             ioctl_with_ref(
753                 &group_file,
754                 VFIO_GROUP_SET_CONTAINER,
755                 &container_raw_descriptor,
756             )
757         };
758         if ret < 0 {
759             return Err(VfioError::GroupSetContainer(get_error()));
760         }
761 
762         Ok(VfioGroup {
763             group: group_file,
764             device_num: 0,
765         })
766     }
767 
get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32>768     fn get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32> {
769         let mut uuid_path = PathBuf::new();
770         uuid_path.push(sysfspath);
771         uuid_path.push("iommu_group");
772         let group_path = uuid_path
773             .read_link()
774             .map_err(|e| VfioError::ReadLink(e, uuid_path))?;
775         let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
776         let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
777         let group_id = group_str
778             .parse::<u32>()
779             .map_err(|_| VfioError::InvalidPath)?;
780 
781         Ok(group_id)
782     }
783 
kvm_device_set_group( &self, kvm_vfio_file: &SafeDescriptor, ops: KvmVfioGroupOps, ) -> Result<()>784     fn kvm_device_set_group(
785         &self,
786         kvm_vfio_file: &SafeDescriptor,
787         ops: KvmVfioGroupOps,
788     ) -> Result<()> {
789         let group_descriptor = self.as_raw_descriptor();
790         let group_descriptor_ptr = &group_descriptor as *const i32;
791         let vfio_dev_attr = match ops {
792             KvmVfioGroupOps::Add => kvm_sys::kvm_device_attr {
793                 flags: 0,
794                 group: kvm_sys::KVM_DEV_VFIO_GROUP,
795                 attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
796                 addr: group_descriptor_ptr as u64,
797             },
798             KvmVfioGroupOps::Delete => kvm_sys::kvm_device_attr {
799                 flags: 0,
800                 group: kvm_sys::KVM_DEV_VFIO_GROUP,
801                 attr: kvm_sys::KVM_DEV_VFIO_GROUP_DEL as u64,
802                 addr: group_descriptor_ptr as u64,
803             },
804         };
805 
806         // SAFETY:
807         // Safe as we are the owner of vfio_dev_descriptor and vfio_dev_attr which are valid value,
808         // and we verify the return value.
809         if 0 != unsafe {
810             ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr)
811         } {
812             return Err(VfioError::KvmSetDeviceAttr(get_error()));
813         }
814 
815         Ok(())
816     }
817 
get_device(&self, name: &str) -> Result<File>818     fn get_device(&self, name: &str) -> Result<File> {
819         let path: CString = CString::new(name.as_bytes()).expect("CString::new() failed");
820         let path_ptr = path.as_ptr();
821 
822         // SAFETY:
823         // Safe as we are the owner of self and path_ptr which are valid value.
824         let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD, path_ptr) };
825         if ret < 0 {
826             return Err(VfioError::GroupGetDeviceFD(get_error()));
827         }
828 
829         // SAFETY:
830         // Safe as ret is valid descriptor
831         Ok(unsafe { File::from_raw_descriptor(ret) })
832     }
833 
add_device_num(&mut self)834     fn add_device_num(&mut self) {
835         self.device_num += 1;
836     }
837 
reduce_device_num(&mut self)838     fn reduce_device_num(&mut self) {
839         self.device_num -= 1;
840     }
841 
device_num(&self) -> u32842     fn device_num(&self) -> u32 {
843         self.device_num
844     }
845 }
846 
847 impl AsRawDescriptor for VfioGroup {
as_raw_descriptor(&self) -> RawDescriptor848     fn as_raw_descriptor(&self) -> RawDescriptor {
849         self.group.as_raw_descriptor()
850     }
851 }
852 
853 /// A helper struct for managing VFIO containers
854 #[derive(Default)]
855 pub struct VfioContainerManager {
856     /// One VFIO container shared by all VFIO devices that don't attach to any IOMMU device.
857     no_iommu_container: Option<Arc<Mutex<VfioContainer>>>,
858 
859     /// For IOMMU enabled devices, all VFIO groups that share the same IOVA space are managed by
860     /// one VFIO container.
861     iommu_containers: Vec<Arc<Mutex<VfioContainer>>>,
862 
863     /// One VFIO container shared by all VFIO devices that attach to the CoIOMMU device.
864     coiommu_container: Option<Arc<Mutex<VfioContainer>>>,
865 
866     /// One VFIO container shared by all VFIO devices that attach to pKVM.
867     pkvm_iommu_container: Option<Arc<Mutex<VfioContainer>>>,
868 }
869 
870 impl VfioContainerManager {
new() -> Self871     pub fn new() -> Self {
872         Self::default()
873     }
874 
875     /// The single place to create a VFIO container for a PCI endpoint.
876     ///
877     /// The policy to determine whether an individual or a shared VFIO container
878     /// will be created for this device is governed by the physical PCI topology,
879     /// and the argument iommu_type.
880     ///
881     ///  # Arguments
882     ///
883     ///  * `sysfspath` - the path to the PCI device, e.g. /sys/bus/pci/devices/0000:02:00.0
884     ///  * `iommu_type` - which type of IOMMU is enabled on this device
get_container<P: AsRef<Path>>( &mut self, iommu_type: IommuDevType, sysfspath: Option<P>, ) -> Result<Arc<Mutex<VfioContainer>>>885     pub fn get_container<P: AsRef<Path>>(
886         &mut self,
887         iommu_type: IommuDevType,
888         sysfspath: Option<P>,
889     ) -> Result<Arc<Mutex<VfioContainer>>> {
890         match iommu_type {
891             IommuDevType::NoIommu => {
892                 // One VFIO container is used for all IOMMU disabled groups.
893                 if let Some(container) = &self.no_iommu_container {
894                     Ok(container.clone())
895                 } else {
896                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
897                     self.no_iommu_container = Some(container.clone());
898                     Ok(container)
899                 }
900             }
901             IommuDevType::VirtioIommu => {
902                 let path = sysfspath.ok_or(VfioError::InvalidPath)?;
903                 let group_id = VfioGroup::get_group_id(path)?;
904 
905                 // One VFIO container is used for all devices that belong to one VFIO group.
906                 // NOTE: vfio_wrapper relies on each container containing exactly one group.
907                 if let Some(container) = self
908                     .iommu_containers
909                     .iter()
910                     .find(|container| container.lock().is_group_set(group_id))
911                 {
912                     Ok(container.clone())
913                 } else {
914                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
915                     self.iommu_containers.push(container.clone());
916                     Ok(container)
917                 }
918             }
919             IommuDevType::CoIommu => {
920                 // One VFIO container is used for devices attached to CoIommu
921                 if let Some(container) = &self.coiommu_container {
922                     Ok(container.clone())
923                 } else {
924                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
925                     self.coiommu_container = Some(container.clone());
926                     Ok(container)
927                 }
928             }
929             IommuDevType::PkvmPviommu => {
930                 // One VFIO container is used for devices attached to pKVM
931                 if let Some(container) = &self.pkvm_iommu_container {
932                     Ok(container.clone())
933                 } else {
934                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
935                     self.pkvm_iommu_container = Some(container.clone());
936                     Ok(container)
937                 }
938             }
939         }
940     }
941 }
942 
/// Kind of VFIO interrupt that an enable/disable/mask/unmask operation applies to.
pub enum VfioIrqType {
    /// INTx interrupt.
    Intx,
    /// MSI interrupt.
    Msi,
    /// MSI-X interrupt.
    Msix,
}
949 
/// Description of one VFIO interrupt, used to assign and enable/disable/mask/unmask it.
pub struct VfioIrq {
    /// Flags reported for this interrupt.
    pub flags: u32,
    /// Index of this interrupt on the device.
    pub index: u32,
}
955 
/// Location inside a VFIO memory region.
#[derive(Debug, Default, Clone)]
pub struct VfioRegionAddr {
    /// Number of the region being addressed.
    pub index: usize,
    /// Offset within that region.
    pub addr: u64,
}
964 
965 #[derive(Debug)]
966 pub struct VfioRegion {
967     // flags for this region: read/write/mmap
968     flags: u32,
969     size: u64,
970     // region offset used to read/write with vfio device descriptor
971     offset: u64,
972     // vectors for mmap offset and size
973     mmaps: Vec<vfio_region_sparse_mmap_area>,
974     // type and subtype for cap type
975     cap_info: Option<(u32, u32)>,
976 }
977 
978 /// Vfio device for exposing regions which could be read/write to kernel vfio device.
979 pub struct VfioDevice {
980     dev: File,
981     name: String,
982     container: Arc<Mutex<VfioContainer>>,
983     dev_type: VfioDeviceType,
984     group_descriptor: RawDescriptor,
985     group_id: u32,
986     // vec for vfio device's regions
987     regions: Vec<VfioRegion>,
988     num_irqs: u32,
989 
990     iova_alloc: Arc<Mutex<AddressAllocator>>,
991     dt_symbol: Option<String>,
992     pviommu: Option<(Arc<Mutex<KvmVfioPviommu>>, Vec<u32>)>,
993 }
994 
995 impl VfioDevice {
996     /// Create a new vfio device, then guest read/write on this device could be
997     /// transfered into kernel vfio.
998     /// sysfspath specify the vfio device path in sys file system.
new_passthrough<P: AsRef<Path>>( sysfspath: &P, vm: &impl Vm, container: Arc<Mutex<VfioContainer>>, iommu_dev: IommuDevType, dt_symbol: Option<String>, ) -> Result<Self>999     pub fn new_passthrough<P: AsRef<Path>>(
1000         sysfspath: &P,
1001         vm: &impl Vm,
1002         container: Arc<Mutex<VfioContainer>>,
1003         iommu_dev: IommuDevType,
1004         dt_symbol: Option<String>,
1005     ) -> Result<Self> {
1006         let group_id = VfioGroup::get_group_id(sysfspath)?;
1007 
1008         let group = container
1009             .lock()
1010             .get_group_with_vm(group_id, vm, iommu_dev)?;
1011         let name_osstr = sysfspath
1012             .as_ref()
1013             .file_name()
1014             .ok_or(VfioError::InvalidPath)?;
1015         let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
1016         let name = String::from(name_str);
1017         let dev = group.lock().get_device(&name)?;
1018         let (dev_info, dev_type) = Self::get_device_info(&dev)?;
1019         let regions = Self::get_regions(&dev, dev_info.num_regions)?;
1020         group.lock().add_device_num();
1021         let group_descriptor = group.lock().as_raw_descriptor();
1022 
1023         let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
1024         let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
1025             .map_err(VfioError::Resources)?;
1026 
1027         let pviommu = if matches!(iommu_dev, IommuDevType::PkvmPviommu) {
1028             // We currently have a 1-to-1 mapping between pvIOMMUs and VFIO devices.
1029             let pviommu = KvmVfioPviommu::new(vm)?;
1030 
1031             let vsids_len = KvmVfioPviommu::get_sid_count(vm, &dev)?.try_into().unwrap();
1032             let max_vsid = u32::MAX.try_into().unwrap();
1033             let random_vsids = sample(&mut thread_rng(), max_vsid, vsids_len).into_iter();
1034             let vsids = Vec::from_iter(random_vsids.map(|v| u32::try_from(v).unwrap()));
1035             for (i, vsid) in vsids.iter().enumerate() {
1036                 pviommu.attach(&dev, i.try_into().unwrap(), *vsid)?;
1037             }
1038 
1039             Some((Arc::new(Mutex::new(pviommu)), vsids))
1040         } else {
1041             None
1042         };
1043 
1044         Ok(VfioDevice {
1045             dev,
1046             name,
1047             container,
1048             dev_type,
1049             group_descriptor,
1050             group_id,
1051             regions,
1052             num_irqs: dev_info.num_irqs,
1053             iova_alloc: Arc::new(Mutex::new(iova_alloc)),
1054             dt_symbol,
1055             pviommu,
1056         })
1057     }
1058 
new<P: AsRef<Path>>( sysfspath: &P, container: Arc<Mutex<VfioContainer>>, ) -> Result<Self>1059     pub fn new<P: AsRef<Path>>(
1060         sysfspath: &P,
1061         container: Arc<Mutex<VfioContainer>>,
1062     ) -> Result<Self> {
1063         let group_id = VfioGroup::get_group_id(sysfspath)?;
1064         let group = container.lock().get_group(group_id)?;
1065         let name_osstr = sysfspath
1066             .as_ref()
1067             .file_name()
1068             .ok_or(VfioError::InvalidPath)?;
1069         let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
1070         let name = String::from(name_str);
1071 
1072         let dev = match group.lock().get_device(&name) {
1073             Ok(dev) => dev,
1074             Err(e) => {
1075                 container.lock().remove_group(group_id, false);
1076                 return Err(e);
1077             }
1078         };
1079         let (dev_info, dev_type) = match Self::get_device_info(&dev) {
1080             Ok(dev_info) => dev_info,
1081             Err(e) => {
1082                 container.lock().remove_group(group_id, false);
1083                 return Err(e);
1084             }
1085         };
1086         let regions = match Self::get_regions(&dev, dev_info.num_regions) {
1087             Ok(regions) => regions,
1088             Err(e) => {
1089                 container.lock().remove_group(group_id, false);
1090                 return Err(e);
1091             }
1092         };
1093         group.lock().add_device_num();
1094         let group_descriptor = group.lock().as_raw_descriptor();
1095 
1096         let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
1097         let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
1098             .map_err(VfioError::Resources)?;
1099 
1100         Ok(VfioDevice {
1101             dev,
1102             name,
1103             container,
1104             dev_type,
1105             group_descriptor,
1106             group_id,
1107             regions,
1108             num_irqs: dev_info.num_irqs,
1109             iova_alloc: Arc::new(Mutex::new(iova_alloc)),
1110             dt_symbol: None,
1111             pviommu: None,
1112         })
1113     }
1114 
1115     /// Returns the file for this device.
dev_file(&self) -> &File1116     pub fn dev_file(&self) -> &File {
1117         &self.dev
1118     }
1119 
1120     /// Returns PCI device name, formatted as BUS:DEVICE.FUNCTION string.
device_name(&self) -> &String1121     pub fn device_name(&self) -> &String {
1122         &self.name
1123     }
1124 
1125     /// Returns the type of this VFIO device.
device_type(&self) -> VfioDeviceType1126     pub fn device_type(&self) -> VfioDeviceType {
1127         self.dev_type
1128     }
1129 
1130     /// Returns the DT symbol (node label) of this VFIO device.
dt_symbol(&self) -> Option<&str>1131     pub fn dt_symbol(&self) -> Option<&str> {
1132         self.dt_symbol.as_deref()
1133     }
1134 
1135     /// Returns the type and indentifier (if applicable) of the IOMMU used by this VFIO device and
1136     /// its master IDs.
iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])>1137     pub fn iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])> {
1138         // We currently only report IommuDevType::PkvmPviommu.
1139         if let Some((ref pviommu, ref ids)) = self.pviommu {
1140             Some((
1141                 IommuDevType::PkvmPviommu,
1142                 Some(pviommu.lock().id()),
1143                 ids.as_ref(),
1144             ))
1145         } else {
1146             None
1147         }
1148     }
1149 
1150     /// enter the device's low power state
pm_low_power_enter(&self) -> Result<()>1151     pub fn pm_low_power_enter(&self) -> Result<()> {
1152         let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
1153         device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
1154         device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY;
1155         // SAFETY:
1156         // Safe as we are the owner of self and power_management which are valid value
1157         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
1158         if ret < 0 {
1159             Err(VfioError::VfioPmLowPowerEnter(get_error()))
1160         } else {
1161             Ok(())
1162         }
1163     }
1164 
1165     /// enter the device's low power state with wakeup notification
pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()>1166     pub fn pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()> {
1167         let payload = vfio_device_low_power_entry_with_wakeup {
1168             wakeup_eventfd: wakeup_evt.as_raw_descriptor(),
1169             reserved: 0,
1170         };
1171         let payload_size = mem::size_of::<vfio_device_low_power_entry_with_wakeup>();
1172         let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(payload_size);
1173         device_feature[0].argsz = (mem::size_of::<vfio_device_feature>() + payload_size) as u32;
1174         device_feature[0].flags =
1175             VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
1176         // SAFETY:
1177         // Safe as we know vfio_device_low_power_entry_with_wakeup has two 32-bit int fields
1178         unsafe {
1179             device_feature[0]
1180                 .data
1181                 .as_mut_slice(payload_size)
1182                 .copy_from_slice(
1183                     mem::transmute::<vfio_device_low_power_entry_with_wakeup, [u8; 8]>(payload)
1184                         .as_slice(),
1185                 );
1186         }
1187         // SAFETY:
1188         // Safe as we are the owner of self and power_management which are valid value
1189         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
1190         if ret < 0 {
1191             Err(VfioError::VfioPmLowPowerEnter(get_error()))
1192         } else {
1193             Ok(())
1194         }
1195     }
1196 
1197     /// exit the device's low power state
pm_low_power_exit(&self) -> Result<()>1198     pub fn pm_low_power_exit(&self) -> Result<()> {
1199         let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
1200         device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
1201         device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT;
1202         // SAFETY:
1203         // Safe as we are the owner of self and power_management which are valid value
1204         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
1205         if ret < 0 {
1206             Err(VfioError::VfioPmLowPowerExit(get_error()))
1207         } else {
1208             Ok(())
1209         }
1210     }
1211 
1212     /// call _DSM from the device's ACPI table
acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>>1213     pub fn acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>> {
1214         let count = args.len();
1215         let mut dsm = vec_with_array_field::<vfio_acpi_dsm, u8>(count);
1216         dsm[0].argsz = (mem::size_of::<vfio_acpi_dsm>() + mem::size_of_val(args)) as u32;
1217         dsm[0].padding = 0;
1218         // SAFETY:
1219         // Safe as we allocated enough space to hold args
1220         unsafe {
1221             dsm[0].args.as_mut_slice(count).clone_from_slice(args);
1222         }
1223         // SAFETY:
1224         // Safe as we are the owner of self and dsm which are valid value
1225         let ret = unsafe { ioctl_with_mut_ref(&self.dev, VFIO_DEVICE_ACPI_DSM, &mut dsm[0]) };
1226         if ret < 0 {
1227             Err(VfioError::VfioAcpiDsm(get_error()))
1228         } else {
1229             // SAFETY:
1230             // Safe as we allocated enough space to hold args
1231             let res = unsafe { dsm[0].args.as_slice(count) };
1232             Ok(res.to_vec())
1233         }
1234     }
1235 
1236     /// Enable vfio device's ACPI notifications and associate EventFD with device.
acpi_notification_evt_enable( &self, acpi_notification_eventfd: &Event, index: u32, ) -> Result<()>1237     pub fn acpi_notification_evt_enable(
1238         &self,
1239         acpi_notification_eventfd: &Event,
1240         index: u32,
1241     ) -> Result<()> {
1242         let u32_size = mem::size_of::<u32>();
1243         let count = 1;
1244 
1245         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
1246         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
1247         irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
1248         irq_set[0].index = index;
1249         irq_set[0].start = 0;
1250         irq_set[0].count = count as u32;
1251 
1252         // SAFETY:
1253         // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
1254         let data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
1255         data.copy_from_slice(&acpi_notification_eventfd.as_raw_descriptor().to_ne_bytes()[..]);
1256 
1257         // SAFETY:
1258         // Safe as we are the owner of self and irq_set which are valid value
1259         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1260         if ret < 0 {
1261             Err(VfioError::VfioAcpiNotificationEnable(get_error()))
1262         } else {
1263             Ok(())
1264         }
1265     }
1266 
1267     /// Disable vfio device's ACPI notification and disconnect EventFd with device.
acpi_notification_disable(&self, index: u32) -> Result<()>1268     pub fn acpi_notification_disable(&self, index: u32) -> Result<()> {
1269         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
1270         irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
1271         irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1272         irq_set[0].index = index;
1273         irq_set[0].start = 0;
1274         irq_set[0].count = 0;
1275 
1276         // SAFETY:
1277         // Safe as we are the owner of self and irq_set which are valid value
1278         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1279         if ret < 0 {
1280             Err(VfioError::VfioAcpiNotificationDisable(get_error()))
1281         } else {
1282             Ok(())
1283         }
1284     }
1285 
1286     /// Test vfio device's ACPI notification by simulating hardware triggering.
1287     /// When the signaling mechanism is set, the VFIO_IRQ_SET_DATA_BOOL can be used with
1288     /// VFIO_IRQ_SET_ACTION_TRIGGER to perform kernel level interrupt loopback testing.
acpi_notification_test(&self, index: u32, val: u32) -> Result<()>1289     pub fn acpi_notification_test(&self, index: u32, val: u32) -> Result<()> {
1290         let u32_size = mem::size_of::<u32>();
1291         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
1292         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + u32_size) as u32;
1293         irq_set[0].flags = VFIO_IRQ_SET_DATA_BOOL | VFIO_IRQ_SET_ACTION_TRIGGER;
1294         irq_set[0].index = index;
1295         irq_set[0].start = 0;
1296         irq_set[0].count = 1;
1297 
1298         // SAFETY:
1299         // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
1300         let data = unsafe { irq_set[0].data.as_mut_slice(u32_size) };
1301         data.copy_from_slice(&val.to_ne_bytes()[..]);
1302 
1303         // SAFETY:
1304         // Safe as we are the owner of self and irq_set which are valid value
1305         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1306         if ret < 0 {
1307             Err(VfioError::VfioAcpiNotificationTest(get_error()))
1308         } else {
1309             Ok(())
1310         }
1311     }
1312 
1313     /// Enable vfio device's irq and associate Irqfd Event with device.
1314     /// When MSIx is enabled, multi vectors will be supported, and vectors starting from subindex to
1315     /// subindex + descriptors length will be assigned with irqfd in the descriptors array.
1316     /// when index = VFIO_PCI_REQ_IRQ_INDEX, kernel vfio will trigger this event when physical
1317     /// device is removed.
1318     /// If descriptor is None, -1 is assigned to the irq. A value of -1 is used to either de-assign
1319     /// interrupts if already assigned or skip un-assigned interrupts.
irq_enable( &self, descriptors: &[Option<&Event>], index: u32, subindex: u32, ) -> Result<()>1320     pub fn irq_enable(
1321         &self,
1322         descriptors: &[Option<&Event>],
1323         index: u32,
1324         subindex: u32,
1325     ) -> Result<()> {
1326         let count = descriptors.len();
1327         let u32_size = mem::size_of::<u32>();
1328         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
1329         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
1330         irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
1331         irq_set[0].index = index;
1332         irq_set[0].start = subindex;
1333         irq_set[0].count = count as u32;
1334 
1335         // SAFETY:
1336         // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
1337         // is u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1338         // together as u32. It is safe as enough space is reserved through
1339         // vec_with_array_field(u32)<count>.
1340         let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
1341         for descriptor in descriptors.iter().take(count) {
1342             let (left, right) = data.split_at_mut(u32_size);
1343             match descriptor {
1344                 Some(fd) => left.copy_from_slice(&fd.as_raw_descriptor().to_ne_bytes()[..]),
1345                 None => left.copy_from_slice(&(-1i32).to_ne_bytes()[..]),
1346             }
1347             data = right;
1348         }
1349 
1350         // SAFETY:
1351         // Safe as we are the owner of self and irq_set which are valid value
1352         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1353         if ret < 0 {
1354             Err(VfioError::VfioIrqEnable(get_error()))
1355         } else {
1356             Ok(())
1357         }
1358     }
1359 
1360     /// When intx is enabled, irqfd is used to trigger a level interrupt into guest, resample irqfd
1361     /// is used to get guest EOI notification.
1362     /// When host hw generates interrupt, vfio irq handler in host kernel receive and handle it,
1363     /// this handler disable hw irq first, then trigger irqfd to inject interrupt into guest. When
1364     /// resample irqfd is triggered by guest EOI, vfio kernel could enable hw irq, so hw could
1365     /// generate another interrupts.
1366     /// This function enable resample irqfd and let vfio kernel could get EOI notification.
1367     ///
1368     /// descriptor: should be resample IrqFd.
resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()>1369     pub fn resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()> {
1370         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
1371         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
1372         irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
1373         irq_set[0].index = index;
1374         irq_set[0].start = 0;
1375         irq_set[0].count = 1;
1376 
1377         {
1378             // SAFETY:
1379             // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data is
1380             // u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1381             // together as u32. It is safe as enough space is reserved through
1382             // vec_with_array_field(u32)<1>.
1383             let descriptors = unsafe { irq_set[0].data.as_mut_slice(4) };
1384             descriptors.copy_from_slice(&descriptor.as_raw_descriptor().to_le_bytes()[..]);
1385         }
1386 
1387         // SAFETY:
1388         // Safe as we are the owner of self and irq_set which are valid value
1389         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1390         if ret < 0 {
1391             Err(VfioError::VfioIrqEnable(get_error()))
1392         } else {
1393             Ok(())
1394         }
1395     }
1396 
1397     /// disable vfio device's irq and disconnect Irqfd Event with device
irq_disable(&self, index: u32) -> Result<()>1398     pub fn irq_disable(&self, index: u32) -> Result<()> {
1399         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
1400         irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
1401         irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1402         irq_set[0].index = index;
1403         irq_set[0].start = 0;
1404         irq_set[0].count = 0;
1405 
1406         // SAFETY:
1407         // Safe as we are the owner of self and irq_set which are valid value
1408         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1409         if ret < 0 {
1410             Err(VfioError::VfioIrqDisable(get_error()))
1411         } else {
1412             Ok(())
1413         }
1414     }
1415 
1416     /// Unmask vfio device irq
irq_unmask(&self, index: u32) -> Result<()>1417     pub fn irq_unmask(&self, index: u32) -> Result<()> {
1418         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
1419         irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
1420         irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
1421         irq_set[0].index = index;
1422         irq_set[0].start = 0;
1423         irq_set[0].count = 1;
1424 
1425         // SAFETY:
1426         // Safe as we are the owner of self and irq_set which are valid value
1427         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1428         if ret < 0 {
1429             Err(VfioError::VfioIrqUnmask(get_error()))
1430         } else {
1431             Ok(())
1432         }
1433     }
1434 
1435     /// Mask vfio device irq
irq_mask(&self, index: u32) -> Result<()>1436     pub fn irq_mask(&self, index: u32) -> Result<()> {
1437         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
1438         irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
1439         irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
1440         irq_set[0].index = index;
1441         irq_set[0].start = 0;
1442         irq_set[0].count = 1;
1443 
1444         // SAFETY:
1445         // Safe as we are the owner of self and irq_set which are valid value
1446         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1447         if ret < 0 {
1448             Err(VfioError::VfioIrqMask(get_error()))
1449         } else {
1450             Ok(())
1451         }
1452     }
1453 
1454     /// Get and validate VFIO device information.
get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)>1455     fn get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)> {
1456         let mut dev_info = vfio_device_info {
1457             argsz: mem::size_of::<vfio_device_info>() as u32,
1458             flags: 0,
1459             num_regions: 0,
1460             num_irqs: 0,
1461             ..Default::default()
1462         };
1463 
1464         // SAFETY:
1465         // Safe as we are the owner of device_file and dev_info which are valid value,
1466         // and we verify the return value.
1467         let ret = unsafe { ioctl_with_mut_ref(device_file, VFIO_DEVICE_GET_INFO, &mut dev_info) };
1468         if ret < 0 {
1469             return Err(VfioError::VfioDeviceGetInfo(get_error()));
1470         }
1471 
1472         let dev_type = if (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) != 0 {
1473             if dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
1474                 || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
1475             {
1476                 return Err(VfioError::VfioDeviceGetInfo(get_error()));
1477             }
1478 
1479             VfioDeviceType::Pci
1480         } else if (dev_info.flags & VFIO_DEVICE_FLAGS_PLATFORM) != 0 {
1481             VfioDeviceType::Platform
1482         } else {
1483             return Err(VfioError::UnknownDeviceType(dev_info.flags));
1484         };
1485 
1486         Ok((dev_info, dev_type))
1487     }
1488 
1489     /// Query interrupt information
1490     /// return: Vector of interrupts information, each of which contains flags and index
get_irqs(&self) -> Result<Vec<VfioIrq>>1491     pub fn get_irqs(&self) -> Result<Vec<VfioIrq>> {
1492         let mut irqs: Vec<VfioIrq> = Vec::new();
1493 
1494         for i in 0..self.num_irqs {
1495             let argsz = mem::size_of::<vfio_irq_info>() as u32;
1496             let mut irq_info = vfio_irq_info {
1497                 argsz,
1498                 flags: 0,
1499                 index: i,
1500                 count: 0,
1501             };
1502             // SAFETY:
1503             // Safe as we are the owner of dev and irq_info which are valid value,
1504             // and we verify the return value.
1505             let ret = unsafe {
1506                 ioctl_with_mut_ref(self.device_file(), VFIO_DEVICE_GET_IRQ_INFO, &mut irq_info)
1507             };
1508             if ret < 0 || irq_info.count != 1 {
1509                 return Err(VfioError::VfioDeviceGetInfo(get_error()));
1510             }
1511 
1512             let irq = VfioIrq {
1513                 flags: irq_info.flags,
1514                 index: irq_info.index,
1515             };
1516             irqs.push(irq);
1517         }
1518         Ok(irqs)
1519     }
1520 
1521     #[allow(clippy::cast_ptr_alignment)]
get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>>1522     fn get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>> {
1523         let mut regions: Vec<VfioRegion> = Vec::new();
1524         for i in 0..num_regions {
1525             let argsz = mem::size_of::<vfio_region_info>() as u32;
1526             let mut reg_info = vfio_region_info {
1527                 argsz,
1528                 flags: 0,
1529                 index: i,
1530                 cap_offset: 0,
1531                 size: 0,
1532                 offset: 0,
1533             };
1534             let ret =
1535                 // SAFETY:
1536                 // Safe as we are the owner of dev and reg_info which are valid value,
1537                 // and we verify the return value.
1538                 unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO, &mut reg_info) };
1539             if ret < 0 {
1540                 continue;
1541             }
1542 
1543             let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
1544             let mut cap_info: Option<(u32, u32)> = None;
1545             if reg_info.argsz > argsz {
1546                 let cap_len: usize = (reg_info.argsz - argsz) as usize;
1547                 let mut region_with_cap =
1548                     vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
1549                 region_with_cap[0].region_info.argsz = reg_info.argsz;
1550                 region_with_cap[0].region_info.flags = 0;
1551                 region_with_cap[0].region_info.index = i;
1552                 region_with_cap[0].region_info.cap_offset = 0;
1553                 region_with_cap[0].region_info.size = 0;
1554                 region_with_cap[0].region_info.offset = 0;
1555                 // SAFETY:
1556                 // Safe as we are the owner of dev and region_info which are valid value,
1557                 // and we verify the return value.
1558                 let ret = unsafe {
1559                     ioctl_with_mut_ref(
1560                         dev,
1561                         VFIO_DEVICE_GET_REGION_INFO,
1562                         &mut (region_with_cap[0].region_info),
1563                     )
1564                 };
1565                 if ret < 0 {
1566                     return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
1567                 }
1568 
1569                 if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
1570                     continue;
1571                 }
1572 
1573                 let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
1574                 let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
1575                 let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
1576                 let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
1577                 let region_info_sz = reg_info.argsz;
1578 
1579                 // region_with_cap[0].cap_info may contain many structures, like
1580                 // vfio_region_info_cap_sparse_mmap struct or vfio_region_info_cap_type struct.
1581                 // Both of them begin with vfio_info_cap_header, so we will get individual cap from
1582                 // vfio_into_cap_header.
1583                 // Go through all the cap structs.
1584                 let info_ptr = region_with_cap.as_ptr() as *mut u8;
1585                 let mut offset = region_with_cap[0].region_info.cap_offset;
1586                 while offset != 0 {
1587                     if offset + cap_header_sz > region_info_sz {
1588                         break;
1589                     }
1590                     // SAFETY:
1591                     // Safe, as cap_header struct is in this function allocated region_with_cap
1592                     // vec.
1593                     let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
1594                     // SAFETY:
1595                     // Safe, as cap_header struct is in this function allocated region_with_cap
1596                     // vec.
1597                     let cap_header = unsafe { &*(cap_ptr as *const vfio_info_cap_header) };
1598                     if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
1599                         if offset + mmap_cap_sz > region_info_sz {
1600                             break;
1601                         }
1602                         // cap_ptr is vfio_region_info_cap_sparse_mmap here
1603                         let sparse_mmap =
1604                             // SAFETY:
1605                             // Safe, this vfio_region_info_cap_sparse_mmap is in this function
1606                             // allocated region_with_cap vec.
1607                             unsafe { &*(cap_ptr as *const vfio_region_info_cap_sparse_mmap) };
1608 
1609                         let area_num = sparse_mmap.nr_areas;
1610                         if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
1611                             break;
1612                         }
1613                         let areas =
1614                             // SAFETY:
1615                             // Safe, these vfio_region_sparse_mmap_area are in this function allocated
1616                             // region_with_cap vec.
1617                             unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
1618                         for area in areas.iter() {
1619                             mmaps.push(*area);
1620                         }
1621                     } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
1622                         if offset + type_cap_sz > region_info_sz {
1623                             break;
1624                         }
1625                         // cap_ptr is vfio_region_info_cap_type here
1626                         let cap_type_info =
1627                             // SAFETY:
1628                             // Safe, this vfio_region_info_cap_type is in this function allocated
1629                             // region_with_cap vec
1630                             unsafe { &*(cap_ptr as *const vfio_region_info_cap_type) };
1631 
1632                         cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
1633                     } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_MSIX_MAPPABLE {
1634                         mmaps.push(vfio_region_sparse_mmap_area {
1635                             offset: 0,
1636                             size: region_with_cap[0].region_info.size,
1637                         });
1638                     }
1639 
1640                     offset = cap_header.next;
1641                 }
1642             } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1643                 mmaps.push(vfio_region_sparse_mmap_area {
1644                     offset: 0,
1645                     size: reg_info.size,
1646                 });
1647             }
1648 
1649             let region = VfioRegion {
1650                 flags: reg_info.flags,
1651                 size: reg_info.size,
1652                 offset: reg_info.offset,
1653                 mmaps,
1654                 cap_info,
1655             };
1656             regions.push(region);
1657         }
1658 
1659         Ok(regions)
1660     }
1661 
1662     /// get a region's flag
1663     /// the return's value may conatin:
1664     ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
1665     ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
1666     ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
1667     ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
get_region_flags(&self, index: usize) -> u321668     pub fn get_region_flags(&self, index: usize) -> u32 {
1669         match self.regions.get(index) {
1670             Some(v) => v.flags,
1671             None => {
1672                 warn!("get_region_flags() with invalid index: {}", index);
1673                 0
1674             }
1675         }
1676     }
1677 
1678     /// get a region's offset
1679     /// return: Region offset from the start of vfio device descriptor
get_region_offset(&self, index: usize) -> u641680     pub fn get_region_offset(&self, index: usize) -> u64 {
1681         match self.regions.get(index) {
1682             Some(v) => v.offset,
1683             None => {
1684                 warn!("get_region_offset with invalid index: {}", index);
1685                 0
1686             }
1687         }
1688     }
1689 
1690     /// get a region's size
1691     /// return: Region size from the start of vfio device descriptor
get_region_size(&self, index: usize) -> u641692     pub fn get_region_size(&self, index: usize) -> u64 {
1693         match self.regions.get(index) {
1694             Some(v) => v.size,
1695             None => {
1696                 warn!("get_region_size with invalid index: {}", index);
1697                 0
1698             }
1699         }
1700     }
1701 
1702     /// get a number of regions
1703     /// return: Number of regions of vfio device descriptor
get_region_count(&self) -> usize1704     pub fn get_region_count(&self) -> usize {
1705         self.regions.len()
1706     }
1707 
1708     /// get a region's mmap info vector
get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area>1709     pub fn get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area> {
1710         match self.regions.get(index) {
1711             Some(v) => v.mmaps.clone(),
1712             None => {
1713                 warn!("get_region_mmap with invalid index: {}", index);
1714                 Vec::new()
1715             }
1716         }
1717     }
1718 
1719     /// find the specified cap type in device regions
1720     /// Input:
1721     ///      type_:  cap type
1722     ///      sub_type: cap sub_type
1723     /// Output:
1724     ///     None: device doesn't have the specified cap type
1725     ///     Some((bar_index, region_size)): device has the specified cap type, return region's
1726     ///                                     index and size
get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)>1727     pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
1728         for (index, region) in self.regions.iter().enumerate() {
1729             if let Some(cap_info) = &region.cap_info {
1730                 if cap_info.0 == type_ && cap_info.1 == sub_type {
1731                     return Some((index as u32, region.size));
1732                 }
1733             }
1734         }
1735 
1736         None
1737     }
1738 
1739     /// Returns file offset corresponding to the given `VfioRegionAddr`.
1740     /// The offset can be used when reading/writing the VFIO device's FD directly.
get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64>1741     pub fn get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64> {
1742         let region = self
1743             .regions
1744             .get(addr.index)
1745             .ok_or(VfioError::InvalidIndex(addr.index))?;
1746         Ok(region.offset + addr.addr)
1747     }
1748 
1749     /// Read region's data from VFIO device into buf
1750     /// index: region num
1751     /// buf: data destination and buf length is read size
1752     /// addr: offset in the region
region_read(&self, index: usize, buf: &mut [u8], addr: u64)1753     pub fn region_read(&self, index: usize, buf: &mut [u8], addr: u64) {
1754         let stub: &VfioRegion = self
1755             .regions
1756             .get(index)
1757             .unwrap_or_else(|| panic!("tried to read VFIO with an invalid index: {}", index));
1758 
1759         let size = buf.len() as u64;
1760         if size > stub.size || addr + size > stub.size {
1761             panic!(
1762                 "tried to read VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1763                 index, addr, size
1764             );
1765         }
1766 
1767         self.dev
1768             .read_exact_at(buf, stub.offset + addr)
1769             .unwrap_or_else(|e| {
1770                 panic!(
1771                     "failed to read region: index={}, addr=0x{:x}, error={}",
1772                     index, addr, e
1773                 )
1774             });
1775     }
1776 
1777     /// Reads a value from the specified `VfioRegionAddr.addr` + `offset`.
region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T1778     pub fn region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T {
1779         let mut val = mem::MaybeUninit::zeroed();
1780         let buf =
1781             // SAFETY:
1782             // Safe because we have zero-initialized `size_of::<T>()` bytes.
1783             unsafe { slice::from_raw_parts_mut(val.as_mut_ptr() as *mut u8, mem::size_of::<T>()) };
1784         self.region_read(addr.index, buf, addr.addr + offset);
1785         // SAFETY:
1786         // Safe because any bit pattern is valid for a type that implements FromBytes.
1787         unsafe { val.assume_init() }
1788     }
1789 
1790     /// write the data from buf into a vfio device region
1791     /// index: region num
1792     /// buf: data src and buf length is write size
1793     /// addr: offset in the region
region_write(&self, index: usize, buf: &[u8], addr: u64)1794     pub fn region_write(&self, index: usize, buf: &[u8], addr: u64) {
1795         let stub: &VfioRegion = self
1796             .regions
1797             .get(index)
1798             .unwrap_or_else(|| panic!("tried to write VFIO with an invalid index: {}", index));
1799 
1800         let size = buf.len() as u64;
1801         if size > stub.size
1802             || addr + size > stub.size
1803             || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
1804         {
1805             panic!(
1806                 "tried to write VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1807                 index, addr, size
1808             );
1809         }
1810 
1811         self.dev
1812             .write_all_at(buf, stub.offset + addr)
1813             .unwrap_or_else(|e| {
1814                 panic!(
1815                     "failed to write region: index={}, addr=0x{:x}, error={}",
1816                     index, addr, e
1817                 )
1818             });
1819     }
1820 
1821     /// Writes data into the specified `VfioRegionAddr.addr` + `offset`.
region_write_to_addr(&self, data: &[u8], addr: &VfioRegionAddr, offset: u64)1822     pub fn region_write_to_addr(&self, data: &[u8], addr: &VfioRegionAddr, offset: u64) {
1823         self.region_write(addr.index, data, addr.addr + offset);
1824     }
1825 
1826     /// get vfio device's descriptors which are passed into minijail process
keep_rds(&self) -> Vec<RawDescriptor>1827     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
1828         vec![
1829             self.dev.as_raw_descriptor(),
1830             self.group_descriptor,
1831             self.container.lock().as_raw_descriptor(),
1832         ]
1833     }
1834 
1835     /// Add (iova, user_addr) map into vfio container iommu table
1836     /// # Safety
1837     ///
1838     /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
vfio_dma_map( &self, iova: u64, size: u64, user_addr: u64, write_en: bool, ) -> Result<()>1839     pub unsafe fn vfio_dma_map(
1840         &self,
1841         iova: u64,
1842         size: u64,
1843         user_addr: u64,
1844         write_en: bool,
1845     ) -> Result<()> {
1846         self.container
1847             .lock()
1848             .vfio_dma_map(iova, size, user_addr, write_en)
1849     }
1850 
1851     /// Remove (iova, user_addr) map from vfio container iommu table
vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()>1852     pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
1853         self.container.lock().vfio_dma_unmap(iova, size)
1854     }
1855 
vfio_get_iommu_page_size_mask(&self) -> Result<u64>1856     pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
1857         self.container.lock().vfio_get_iommu_page_size_mask()
1858     }
1859 
alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64>1860     pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64> {
1861         self.iova_alloc
1862             .lock()
1863             .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size)
1864             .map_err(VfioError::Resources)
1865     }
1866 
get_iova(&self, alloc: &Alloc) -> Option<AddressRange>1867     pub fn get_iova(&self, alloc: &Alloc) -> Option<AddressRange> {
1868         self.iova_alloc.lock().get(alloc).map(|res| res.0)
1869     }
1870 
release_iova(&self, alloc: Alloc) -> Result<AddressRange>1871     pub fn release_iova(&self, alloc: Alloc) -> Result<AddressRange> {
1872         self.iova_alloc
1873             .lock()
1874             .release(alloc)
1875             .map_err(VfioError::Resources)
1876     }
1877 
get_max_addr(&self) -> u641878     pub fn get_max_addr(&self) -> u64 {
1879         self.iova_alloc.lock().get_max_addr()
1880     }
1881 
1882     /// Gets the vfio device backing `File`.
device_file(&self) -> &File1883     pub fn device_file(&self) -> &File {
1884         &self.dev
1885     }
1886 
1887     /// close vfio device
close(&self)1888     pub fn close(&self) {
1889         self.container.lock().remove_group(self.group_id, true);
1890     }
1891 }
1892 
1893 pub struct VfioPciConfig {
1894     device: Arc<VfioDevice>,
1895 }
1896 
1897 impl VfioPciConfig {
new(device: Arc<VfioDevice>) -> Self1898     pub fn new(device: Arc<VfioDevice>) -> Self {
1899         VfioPciConfig { device }
1900     }
1901 
read_config<T: IntoBytes + FromBytes>(&self, offset: u32) -> T1902     pub fn read_config<T: IntoBytes + FromBytes>(&self, offset: u32) -> T {
1903         let mut config = T::new_zeroed();
1904         self.device.region_read(
1905             VFIO_PCI_CONFIG_REGION_INDEX as usize,
1906             config.as_mut_bytes(),
1907             offset.into(),
1908         );
1909         config
1910     }
1911 
write_config<T: Immutable + IntoBytes>(&self, config: T, offset: u32)1912     pub fn write_config<T: Immutable + IntoBytes>(&self, config: T, offset: u32) {
1913         self.device.region_write(
1914             VFIO_PCI_CONFIG_REGION_INDEX as usize,
1915             config.as_bytes(),
1916             offset.into(),
1917         );
1918     }
1919 
1920     /// Set the VFIO device this config refers to as the bus master.
set_bus_master(&self)1921     pub fn set_bus_master(&self) {
1922         /// Constant definitions from `linux/pci_regs.h`.
1923         const PCI_COMMAND: u32 = 0x4;
1924         /// Enable bus mastering
1925         const PCI_COMMAND_MASTER: u16 = 0x4;
1926 
1927         let mut cmd: u16 = self.read_config(PCI_COMMAND);
1928 
1929         if cmd & PCI_COMMAND_MASTER != 0 {
1930             return;
1931         }
1932 
1933         cmd |= PCI_COMMAND_MASTER;
1934 
1935         self.write_config(cmd, PCI_COMMAND);
1936     }
1937 }
1938 
1939 impl AsRawDescriptor for VfioDevice {
as_raw_descriptor(&self) -> RawDescriptor1940     fn as_raw_descriptor(&self) -> RawDescriptor {
1941         self.dev.as_raw_descriptor()
1942     }
1943 }
1944