// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::cell::RefCell;
use std::collections::HashMap;
use std::ffi::CString;
use std::fs::File;
use std::fs::OpenOptions;
use std::io;
use std::mem;
use std::os::raw::c_ulong;
use std::os::unix::prelude::FileExt;
use std::path::Path;
use std::path::PathBuf;
use std::slice;
use std::sync::Arc;
use std::u32;

use base::error;
use base::ioctl;
use base::ioctl_with_mut_ptr;
use base::ioctl_with_mut_ref;
use base::ioctl_with_ptr;
use base::ioctl_with_ref;
use base::ioctl_with_val;
use base::warn;
use base::AsRawDescriptor;
use base::Error;
use base::Event;
use base::FromRawDescriptor;
use base::RawDescriptor;
use base::SafeDescriptor;
use data_model::vec_with_array_field;
use data_model::zerocopy_from_reader;
use hypervisor::DeviceKind;
use hypervisor::Vm;
use once_cell::sync::OnceCell;
use remain::sorted;
use resources::address_allocator::AddressAllocator;
use resources::AddressRange;
use resources::Alloc;
use resources::Error as ResourcesError;
use sync::Mutex;
use thiserror::Error;
use vfio_sys::*;
use vm_memory::MemoryRegionInformation;
use zerocopy::AsBytes;
use zerocopy::FromBytes;

use crate::IommuDevType;

#[sorted]
#[derive(Error, Debug)]
pub enum VfioError {
    #[error("failed to borrow global vfio container")]
    BorrowVfioContainer,
    #[error("failed to duplicate VfioContainer")]
    ContainerDupError,
    #[error("failed to set container's IOMMU driver type as VfioType1V2: {0}")]
    ContainerSetIOMMU(Error),
    #[error("failed to create KVM vfio device: {0}")]
    CreateVfioKvmDevice(Error),
    #[error("failed to get Group Status: {0}")]
    GetGroupStatus(Error),
    #[error("failed to get vfio device fd: {0}")]
    GroupGetDeviceFD(Error),
    #[error("failed to add vfio group into vfio container: {0}")]
    GroupSetContainer(Error),
70 #[error("group is inviable")]
71 GroupViable,
72 #[error("invalid region index: {0}")]
73 InvalidIndex(u32),
74 #[error("invalid file path")]
75 InvalidPath,
76 #[error("failed to add guest memory map into iommu table: {0}")]
77 IommuDmaMap(Error),
78 #[error("failed to remove guest memory map from iommu table: {0}")]
79 IommuDmaUnmap(Error),
80 #[error("failed to get IOMMU cap info from host")]
81 IommuGetCapInfo,
82 #[error("failed to get IOMMU info from host: {0}")]
83 IommuGetInfo(Error),
84 #[error("failed to set KVM vfio device's attribute: {0}")]
85 KvmSetDeviceAttr(Error),
86 #[error("AddressAllocator is unavailable")]
87 NoRescAlloc,
88 #[error("failed to open /dev/vfio/vfio container: {0}")]
89 OpenContainer(io::Error),
90 #[error("failed to open {1} group: {0}")]
91 OpenGroup(io::Error, String),
92 #[error("resources error: {0}")]
93 Resources(ResourcesError),
94 #[error("unknown vfio device type (flags: {0:#x})")]
95 UnknownDeviceType(u32),
96 #[error(
97 "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
98 )]
99 VfioApiVersion,
100 #[error("failed to get vfio device's info or info doesn't match: {0}")]
101 VfioDeviceGetInfo(Error),
102 #[error("failed to get vfio device's region info: {0}")]
103 VfioDeviceGetRegionInfo(Error),
104 #[error("failed to disable vfio deviece's irq: {0}")]
105 VfioIrqDisable(Error),
106 #[error("failed to enable vfio deviece's irq: {0}")]
107 VfioIrqEnable(Error),
108 #[error("failed to mask vfio deviece's irq: {0}")]
109 VfioIrqMask(Error),
110 #[error("failed to unmask vfio deviece's irq: {0}")]
111 VfioIrqUnmask(Error),
112 #[error("failed to enter vfio deviece's low power state: {0}")]
113 VfioPmLowPowerEnter(Error),
114 #[error("failed to exit vfio deviece's low power state: {0}")]
115 VfioPmLowPowerExit(Error),
116 #[error("container dones't support VfioType1V2 IOMMU driver type")]
117 VfioType1V2,
}

type Result<T> = std::result::Result<T, VfioError>;

fn get_error() -> Error {
    Error::last()
}

static KVM_VFIO_FILE: OnceCell<SafeDescriptor> = OnceCell::new();

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VfioDeviceType {
    Pci,
    Platform,
}

enum KvmVfioGroupOps {
    Add,
    Delete,
}

#[repr(u32)]
enum IommuType {
    Type1V2 = VFIO_TYPE1v2_IOMMU,
    // ChromeOS specific vfio_iommu_type1 implementation that is optimized for
    // small, dynamic mappings. For clients which create large, relatively
    // static mappings, Type1V2 is still preferred.
    //
    // See crrev.com/c/3593528 for the implementation.
    Type1ChromeOS = 100001,
}

// Hint as to whether IOMMU mappings will tend to be large and static or
// small and dynamic.
#[derive(PartialEq, Eq)]
enum IommuMappingHint {
    Static,
    Dynamic,
}

/// VfioContainer contains multiple VfioGroups and delegates an IOMMU domain table.
pub struct VfioContainer {
    container: File,
    groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
}

// Deserializes a `T` from `bytes` at `offset`; panics if the kernel-provided
// buffer is too short to hold the struct.
fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> T
where
    T: FromBytes,
{
    zerocopy_from_reader(&bytes[offset..(offset + mem::size_of::<T>())])
        .expect("malformed kernel data")
}

const VFIO_API_VERSION: u8 = 0;
impl VfioContainer {
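    /// Opens /dev/vfio/vfio and checks that the kernel's VFIO API version
    /// matches `VFIO_API_VERSION`.
    ///
    /// A minimal usage sketch (assumes a host where the VFIO container device
    /// exists and is accessible):
    ///
    /// ```ignore
    /// let container = VfioContainer::new().expect("failed to open /dev/vfio/vfio");
    /// ```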
    pub fn new() -> Result<Self> {
        let container = OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/vfio/vfio")
            .map_err(VfioError::OpenContainer)?;

        Self::new_from_container(container)
    }
    // Construct a VfioContainer from an existing container file.
    pub fn new_from_container(container: File) -> Result<Self> {
        // Safe as file is vfio container descriptor and ioctl is defined by kernel.
        let version = unsafe { ioctl(&container, VFIO_GET_API_VERSION()) };
        if version as u8 != VFIO_API_VERSION {
            return Err(VfioError::VfioApiVersion);
        }

        Ok(VfioContainer {
            container,
            groups: HashMap::new(),
        })
    }
    fn is_group_set(&self, group_id: u32) -> bool {
        self.groups.get(&group_id).is_some()
    }
    fn check_extension(&self, val: IommuType) -> bool {
        // Safe as file is vfio container and val is a valid IOMMU type.
        let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION(), val as c_ulong) };
        ret == 1
    }

    fn set_iommu(&self, val: IommuType) -> i32 {
        // Safe as file is vfio container and val is a valid IOMMU type.
        unsafe { ioctl_with_val(self, VFIO_SET_IOMMU(), val as c_ulong) }
    }

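    /// Maps `[user_addr, user_addr + size)` to `[iova, iova + size)` in the
    /// container's IOMMU table.
    ///
    /// # Safety
    ///
    /// The caller must ensure the memory behind `user_addr..user_addr + size`
    /// remains valid for as long as the mapping is in place.
    ///
    /// A usage sketch (addresses are illustrative; `host_addr` must point at
    /// real memory owned by the caller):
    ///
    /// ```ignore
    /// // Map one page of host memory at IOVA 0x1000, writable.
    /// unsafe { container.vfio_dma_map(0x1000, 0x1000, host_addr, true) }?;
    /// ```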
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        let mut dma_map = vfio_iommu_type1_dma_map {
            argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
            flags: VFIO_DMA_MAP_FLAG_READ,
            vaddr: user_addr,
            iova,
            size,
        };

        if write_en {
            dma_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
        }

        let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA(), &dma_map);
        if ret != 0 {
            return Err(VfioError::IommuDmaMap(get_error()));
        }

        Ok(())
    }

    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        let mut dma_unmap = vfio_iommu_type1_dma_unmap {
            argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
            flags: 0,
            iova,
            size,
            ..Default::default()
        };

        // Safe as file is vfio container, dma_unmap is constructed by us, and
        // we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA(), &mut dma_unmap) };
        if ret != 0 || dma_unmap.size != size {
            return Err(VfioError::IommuDmaUnmap(get_error()));
        }

        Ok(())
    }

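    /// Returns the IOMMU's supported IOVA page size bitmask; bit `N` set means
    /// pages of size `1 << N` are supported.
    ///
    /// A sketch of deriving the smallest supported page size from the mask:
    ///
    /// ```ignore
    /// let mask = container.vfio_get_iommu_page_size_mask()?;
    /// let smallest_page_size = 1u64 << mask.trailing_zeros();
    /// ```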
    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
        let mut iommu_info = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // Safe as file is vfio container, iommu_info has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO(), &mut iommu_info) };
        if ret != 0 || (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES) == 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        Ok(iommu_info.iova_pgsizes)
    }

    pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        // Query the buffer size needed to fetch the capabilities.
        let mut iommu_info_argsz = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // Safe as file is vfio container, iommu_info_argsz has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO(), &mut iommu_info_argsz) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
            iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
        );
        iommu_info[0].argsz = iommu_info_argsz.argsz;
        // Safe as file is vfio container, iommu_info has valid values,
        // and we check the return value
        let ret =
            unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO(), iommu_info.as_mut_ptr()) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        // Safe because we initialized iommu_info with enough space, u8 has less strict
        // alignment, and the buffer will no longer be mutated.
        let info_bytes = unsafe {
            std::slice::from_raw_parts(
                iommu_info.as_ptr() as *const u8,
                iommu_info_argsz.argsz as usize,
            )
        };

        if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        let mut offset = iommu_info[0].cap_offset as usize;
        while offset != 0 {
            let header = extract_vfio_struct::<vfio_info_cap_header>(info_bytes, offset);

            if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 {
                let iova_header = extract_vfio_struct::<vfio_iommu_type1_info_cap_iova_range_header>(
                    info_bytes, offset,
                );
                let range_offset = offset + mem::size_of::<vfio_iommu_type1_info_cap_iova_range>();
                let mut ret = Vec::new();
                for i in 0..iova_header.nr_iovas {
                    ret.push(extract_vfio_struct::<vfio_iova_range>(
                        info_bytes,
                        range_offset + i as usize * mem::size_of::<vfio_iova_range>(),
                    ));
                }
                return Ok(ret
                    .iter()
                    .map(|range| AddressRange {
                        start: range.start,
                        end: range.end,
                    })
                    .collect());
            }
            offset = header.next as usize;
        }

        Err(VfioError::IommuGetCapInfo)
    }

    fn init_vfio_iommu(&mut self, hint: IommuMappingHint) -> Result<()> {
        // If we expect granular, dynamic mappings (i.e. viommu/coiommu), try the
        // ChromeOS-specific Type1ChromeOS first, then fall back to the upstream type.
        if hint == IommuMappingHint::Dynamic {
            if self.set_iommu(IommuType::Type1ChromeOS) == 0 {
                return Ok(());
            }
        }

        if !self.check_extension(IommuType::Type1V2) {
            return Err(VfioError::VfioType1V2);
        }

        if self.set_iommu(IommuType::Type1V2) < 0 {
            return Err(VfioError::ContainerSetIOMMU(get_error()));
        }

        Ok(())
    }

    fn get_group_with_vm(
        &mut self,
        id: u32,
        vm: &impl Vm,
        iommu_enabled: bool,
    ) -> Result<Arc<Mutex<VfioGroup>>> {
        match self.groups.get(&id) {
            Some(group) => Ok(group.clone()),
            None => {
                let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
                if self.groups.is_empty() {
                    // Before the first group is added into the container, do the
                    // once-per-container initialization. Both coiommu and virtio-iommu
                    // rely on small, dynamic mappings. However, if an iommu is not
                    // enabled, then we map the entirety of guest memory as a small
                    // number of large, static mappings.
                    let mapping_hint = if iommu_enabled {
                        IommuMappingHint::Dynamic
                    } else {
                        IommuMappingHint::Static
                    };
                    self.init_vfio_iommu(mapping_hint)?;

                    if !iommu_enabled {
                        vm.get_memory().with_regions(
                            |MemoryRegionInformation {
                                 guest_addr,
                                 size,
                                 host_addr,
                                 ..
                             }| {
                                // Safe because the guest regions are guaranteed not to overlap
                                unsafe {
                                    self.vfio_dma_map(
                                        guest_addr.0,
                                        size as u64,
                                        host_addr as u64,
                                        true,
                                    )
                                }
                            },
                        )?;
                    }
                }

                let kvm_vfio_file = KVM_VFIO_FILE
                    .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
                    .map_err(VfioError::CreateVfioKvmDevice)?;
                group
                    .lock()
                    .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Add)?;

                self.groups.insert(id, group.clone());

                Ok(group)
            }
        }
    }

    fn get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>> {
        match self.groups.get(&id) {
            Some(group) => Ok(group.clone()),
            None => {
                let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));

                if self.groups.is_empty() {
                    // Before the first group is added into the container, do the
                    // once-per-container initialization.
                    self.init_vfio_iommu(IommuMappingHint::Static)?;
                }

                self.groups.insert(id, group.clone());
                Ok(group)
            }
        }
    }

    fn remove_group(&mut self, id: u32, reduce: bool) {
        let mut remove = false;

        if let Some(group) = self.groups.get(&id) {
            if reduce {
                group.lock().reduce_device_num();
            }
            if group.lock().device_num() == 0 {
                let kvm_vfio_file = KVM_VFIO_FILE.get().expect("kvm vfio file isn't created");
                if group
                    .lock()
                    .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Delete)
                    .is_err()
                {
                    warn!("failed to remove vfio group from kvm device");
                }
                remove = true;
            }
        }

        if remove {
            self.groups.remove(&id);
        }
    }

    pub fn clone_as_raw_descriptor(&self) -> Result<RawDescriptor> {
        // Safe because self.container is a valid descriptor and we check the
        // return value of dup().
        let raw_descriptor = unsafe { libc::dup(self.container.as_raw_descriptor()) };
        if raw_descriptor < 0 {
            Err(VfioError::ContainerDupError)
        } else {
            Ok(raw_descriptor)
        }
    }

    // Gets group ids for all groups in the container.
    pub fn group_ids(&self) -> Vec<&u32> {
        self.groups.keys().collect()
    }
}

impl AsRawDescriptor for VfioContainer {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.container.as_raw_descriptor()
    }
}

struct VfioGroup {
    group: File,
    device_num: u32,
}

impl VfioGroup {
    fn new(container: &VfioContainer, id: u32) -> Result<Self> {
        let group_path = format!("/dev/vfio/{}", id);
        let group_file = OpenOptions::new()
            .read(true)
            .write(true)
            .open(Path::new(&group_path))
            .map_err(|e| VfioError::OpenGroup(e, group_path))?;

        let mut group_status = vfio_group_status {
            argsz: mem::size_of::<vfio_group_status>() as u32,
            flags: 0,
        };
        // Safe as we are the owner of group_file and group_status, which are valid values.
        let mut ret =
            unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS(), &mut group_status) };
        if ret < 0 {
            return Err(VfioError::GetGroupStatus(get_error()));
        }

        if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
            return Err(VfioError::GroupViable);
        }

        // Safe as we are the owner of group_file and container_raw_descriptor, which are
        // valid values, and we verify the return value.
        let container_raw_descriptor = container.as_raw_descriptor();
        ret = unsafe {
            ioctl_with_ref(
                &group_file,
                VFIO_GROUP_SET_CONTAINER(),
                &container_raw_descriptor,
            )
        };
        if ret < 0 {
            return Err(VfioError::GroupSetContainer(get_error()));
        }

        Ok(VfioGroup {
            group: group_file,
            device_num: 0,
        })
    }

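    /// Parses the IOMMU group number from the `iommu_group` symlink under
    /// `sysfspath`, e.g. a link target ending in `.../iommu_groups/5` yields `5`.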
    fn get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32> {
        let mut uuid_path = PathBuf::new();
        uuid_path.push(sysfspath);
        uuid_path.push("iommu_group");
        let group_path = uuid_path.read_link().map_err(|_| VfioError::InvalidPath)?;
        let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
        let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let group_id = group_str
            .parse::<u32>()
            .map_err(|_| VfioError::InvalidPath)?;

        Ok(group_id)
    }

    fn kvm_device_set_group(
        &self,
        kvm_vfio_file: &SafeDescriptor,
        ops: KvmVfioGroupOps,
    ) -> Result<()> {
        let group_descriptor = self.as_raw_descriptor();
        let group_descriptor_ptr = &group_descriptor as *const i32;
        let vfio_dev_attr = match ops {
            KvmVfioGroupOps::Add => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
                addr: group_descriptor_ptr as u64,
            },
            KvmVfioGroupOps::Delete => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_DEL as u64,
                addr: group_descriptor_ptr as u64,
            },
        };

        // Safe as we are the owner of kvm_vfio_file and vfio_dev_attr, which are valid
        // values, and we verify the return value.
        if 0 != unsafe {
            ioctl_with_ref(
                kvm_vfio_file,
                kvm_sys::KVM_SET_DEVICE_ATTR(),
                &vfio_dev_attr,
            )
        } {
            return Err(VfioError::KvmSetDeviceAttr(get_error()));
        }

        Ok(())
    }

    fn get_device(&self, name: &str) -> Result<File> {
        let path: CString = CString::new(name.as_bytes()).expect("CString::new() failed");
        let path_ptr = path.as_ptr();

        // Safe as we are the owner of self and path_ptr, which are valid values.
        let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD(), path_ptr) };
        if ret < 0 {
            return Err(VfioError::GroupGetDeviceFD(get_error()));
        }

        // Safe as ret is a valid descriptor.
        Ok(unsafe { File::from_raw_descriptor(ret) })
    }

    fn add_device_num(&mut self) {
        self.device_num += 1;
    }

    fn reduce_device_num(&mut self) {
        self.device_num -= 1;
    }

    fn device_num(&self) -> u32 {
        self.device_num
    }
}

impl AsRawDescriptor for VfioGroup {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.group.as_raw_descriptor()
    }
}

/// A helper trait for managing VFIO setup
pub trait VfioCommonTrait: Send + Sync {
    /// The single place to create a VFIO container for a PCI endpoint.
    ///
    /// The policy to determine whether an individual or a shared VFIO container
    /// will be created for this device is governed by the physical PCI topology,
    /// and the argument iommu_enabled.
    ///
    /// # Arguments
    ///
    /// * `sysfspath` - the path to the PCI device, e.g. /sys/bus/pci/devices/0000:02:00.0
    /// * `iommu_enabled` - whether virtio IOMMU is enabled on this device
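    ///
    /// A usage sketch (the device path is illustrative):
    ///
    /// ```ignore
    /// let container = VfioCommonSetup::vfio_get_container(
    ///     IommuDevType::VirtioIommu,
    ///     Some("/sys/bus/pci/devices/0000:02:00.0"),
    /// )?;
    /// ```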
    fn vfio_get_container<P: AsRef<Path>>(
        iommu_dev: IommuDevType,
        sysfspath: Option<P>,
    ) -> Result<Arc<Mutex<VfioContainer>>>;
}

thread_local! {

    // One VFIO container is shared by all VFIO devices that don't
    // attach to the virtio IOMMU device
    static NO_IOMMU_CONTAINER: RefCell<Option<Arc<Mutex<VfioContainer>>>> = RefCell::new(None);

    // For IOMMU enabled devices, all VFIO groups that share the same IOVA space
    // are managed by one VFIO container
    static IOMMU_CONTAINERS: RefCell<Option<Vec<Arc<Mutex<VfioContainer>>>>> = RefCell::new(Some(Default::default()));

    // One VFIO container is shared by all VFIO devices that
    // attach to the CoIOMMU device
    static COIOMMU_CONTAINER: RefCell<Option<Arc<Mutex<VfioContainer>>>> = RefCell::new(None);
}

pub struct VfioCommonSetup;

impl VfioCommonTrait for VfioCommonSetup {
    fn vfio_get_container<P: AsRef<Path>>(
        iommu_dev: IommuDevType,
        sysfspath: Option<P>,
    ) -> Result<Arc<Mutex<VfioContainer>>> {
        match iommu_dev {
            IommuDevType::NoIommu => {
                // One VFIO container is used for all IOMMU disabled groups
                NO_IOMMU_CONTAINER.with(|v| {
                    if v.borrow().is_some() {
                        if let Some(ref container) = *v.borrow() {
                            Ok(container.clone())
                        } else {
                            Err(VfioError::BorrowVfioContainer)
                        }
                    } else {
                        let container = Arc::new(Mutex::new(VfioContainer::new()?));
                        *v.borrow_mut() = Some(container.clone());
                        Ok(container)
                    }
                })
            }
            IommuDevType::VirtioIommu => {
                let path = sysfspath.ok_or(VfioError::InvalidPath)?;
                let group_id = VfioGroup::get_group_id(path)?;

                // One VFIO container is used for all devices belonging to one VFIO group.
                // NOTE: vfio_wrapper relies on each container containing exactly one group.
                IOMMU_CONTAINERS.with(|v| {
                    if let Some(ref mut containers) = *v.borrow_mut() {
                        let container = containers
                            .iter()
                            .find(|container| container.lock().is_group_set(group_id));

                        match container {
                            None => {
                                let container = Arc::new(Mutex::new(VfioContainer::new()?));
                                containers.push(container.clone());
                                Ok(container)
                            }
                            Some(container) => Ok(container.clone()),
                        }
                    } else {
                        Err(VfioError::BorrowVfioContainer)
                    }
                })
            }
            IommuDevType::CoIommu => {
                // One VFIO container is used for devices attached to CoIommu
                COIOMMU_CONTAINER.with(|v| {
                    if v.borrow().is_some() {
                        if let Some(ref container) = *v.borrow() {
                            Ok(container.clone())
                        } else {
                            Err(VfioError::BorrowVfioContainer)
                        }
                    } else {
                        let container = Arc::new(Mutex::new(VfioContainer::new()?));
                        *v.borrow_mut() = Some(container.clone());
                        Ok(container)
                    }
                })
            }
        }
    }
}

/// Vfio Irq type used to enable/disable/mask/unmask vfio irq
pub enum VfioIrqType {
    Intx,
    Msi,
    Msix,
}

/// Vfio Irq information used to assign and enable/disable/mask/unmask vfio irq
pub struct VfioIrq {
    pub flags: u32,
    pub index: u32,
}

/// Address on VFIO memory region.
#[derive(Debug, Default, Clone)]
pub struct VfioRegionAddr {
    /// region number.
    pub index: u32,
    /// offset in the region.
    pub addr: u64,
}

#[derive(Debug)]
pub struct VfioRegion {
    // flags for this region: read/write/mmap
    flags: u32,
    size: u64,
    // region offset used to read/write with vfio device descriptor
    offset: u64,
    // vectors for mmap offset and size
    mmaps: Vec<vfio_region_sparse_mmap_area>,
    // type and subtype for cap type
    cap_info: Option<(u32, u32)>,
}

/// Vfio device for exposing regions which could be read/write to kernel vfio device.
pub struct VfioDevice {
    dev: File,
    name: String,
    container: Arc<Mutex<VfioContainer>>,
    dev_type: VfioDeviceType,
    group_descriptor: RawDescriptor,
    group_id: u32,
    // vec for vfio device's regions
    regions: Vec<VfioRegion>,
    num_irqs: u32,

    iova_alloc: Arc<Mutex<AddressAllocator>>,
}

impl VfioDevice {
    /// Create a new vfio device; guest reads/writes on this device are
    /// transferred into the kernel vfio driver.
    /// `sysfspath` specifies the vfio device path in the sys file system.
    pub fn new_passthrough<P: AsRef<Path>>(
        sysfspath: &P,
        vm: &impl Vm,
        container: Arc<Mutex<VfioContainer>>,
        iommu_enabled: bool,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;

        let group = container
            .lock()
            .get_group_with_vm(group_id, vm, iommu_enabled)?;
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);
        let dev = group.lock().get_device(&name)?;
        let (dev_info, dev_type) = Self::get_device_info(&dev)?;
        let regions = Self::get_regions(&dev, dev_info.num_regions)?;
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
        })
    }

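    /// Like `new_passthrough`, but without a VM: the group is not wired into a
    /// KVM VFIO device and guest memory is not mapped.
    ///
    /// A usage sketch (the device path is illustrative):
    ///
    /// ```ignore
    /// let container = Arc::new(Mutex::new(VfioContainer::new()?));
    /// let dev = VfioDevice::new(&"/sys/bus/pci/devices/0000:02:00.0", container)?;
    /// ```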
    pub fn new<P: AsRef<Path>>(
        sysfspath: &P,
        container: Arc<Mutex<VfioContainer>>,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;
        let group = container.lock().get_group(group_id)?;
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);

        let dev = match group.lock().get_device(&name) {
            Ok(dev) => dev,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        let (dev_info, dev_type) = match Self::get_device_info(&dev) {
            Ok(dev_info) => dev_info,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        let regions = match Self::get_regions(&dev, dev_info.num_regions) {
            Ok(regions) => regions,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
        })
    }

    /// Returns the file for this device.
    pub fn dev_file(&self) -> &File {
        &self.dev
    }

    /// Returns PCI device name, formatted as BUS:DEVICE.FUNCTION string.
    pub fn device_name(&self) -> &String {
        &self.name
    }

    /// Returns the type of this VFIO device.
    pub fn device_type(&self) -> VfioDeviceType {
        self.dev_type
    }

    /// Enter the device's low power state.
    pub fn pm_low_power_enter(&self) -> Result<()> {
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY;
        // Safe as we are the owner of self and device_feature, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }

    /// Enter the device's low power state with wakeup notification.
    pub fn pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()> {
        let payload = vfio_device_low_power_entry_with_wakeup {
            wakeup_eventfd: wakeup_evt.as_raw_descriptor(),
            reserved: 0,
        };
        let payload_size = mem::size_of::<vfio_device_low_power_entry_with_wakeup>();
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(payload_size);
        device_feature[0].argsz = (mem::size_of::<vfio_device_feature>() + payload_size) as u32;
        device_feature[0].flags =
            VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
        unsafe {
            // Safe as we know vfio_device_low_power_entry_with_wakeup has two 32-bit int fields
            device_feature[0]
                .data
                .as_mut_slice(payload_size)
                .copy_from_slice(
                    mem::transmute::<vfio_device_low_power_entry_with_wakeup, [u8; 8]>(payload)
                        .as_slice(),
                );
        }
        // Safe as we are the owner of self and device_feature, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }

    /// Exit the device's low power state.
    pub fn pm_low_power_exit(&self) -> Result<()> {
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT;
        // Safe as we are the owner of self and device_feature, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE(), &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerExit(get_error()))
        } else {
            Ok(())
        }
    }

    /// Enable vfio device's irq and associate an irqfd Event with the device.
    /// When MSI-X is enabled, multiple vectors are supported; the vectors from `subindex` up to
    /// `subindex` + `descriptors.len()` are assigned the irqfds in the `descriptors` array.
    /// When index = VFIO_PCI_REQ_IRQ_INDEX, kernel vfio will trigger this event when the
    /// physical device is removed.
    /// If a descriptor is None, -1 is assigned to the irq. A value of -1 is used to either
    /// de-assign interrupts if already assigned or skip un-assigned interrupts.
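    ///
    /// A sketch of enabling a single MSI vector (index constants come from vfio_sys):
    ///
    /// ```ignore
    /// let evt = Event::new()?;
    /// device.irq_enable(&[Some(&evt)], VFIO_PCI_MSI_IRQ_INDEX, 0)?;
    /// ```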
    pub fn irq_enable(
        &self,
        descriptors: &[Option<&Event>],
        index: u32,
        subindex: u32,
    ) -> Result<()> {
        let count = descriptors.len();
        let u32_size = mem::size_of::<u32>();
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = subindex;
        irq_set[0].count = count as u32;

        // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
        // defaults to u8; here irq_set.data holds descriptors as u32, so 4 default u8 are
        // combined together as one u32. It is safe as enough space is reserved through
        // vec_with_array_field(u32)<count>.
        let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
        for descriptor in descriptors.iter().take(count) {
            let (left, right) = data.split_at_mut(u32_size);
            match descriptor {
                Some(fd) => left.copy_from_slice(&fd.as_raw_descriptor().to_ne_bytes()[..]),
                None => left.copy_from_slice(&(-1i32).to_ne_bytes()[..]),
            }
            data = right;
        }

        // Safe as we are the owner of self and irq_set, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqEnable(get_error()))
        } else {
            Ok(())
        }
    }

    /// When INTx is enabled, an irqfd is used to trigger a level interrupt into the guest,
    /// and a resample irqfd is used to get the guest's EOI notification.
    /// When the host hardware generates an interrupt, the vfio irq handler in the host
    /// kernel receives and handles it: the handler disables the hw irq first, then triggers
    /// the irqfd to inject the interrupt into the guest. When the resample irqfd is
    /// triggered by the guest's EOI, the vfio kernel driver re-enables the hw irq, so the
    /// hardware can generate further interrupts.
    /// This function enables the resample irqfd so that the vfio kernel driver gets EOI
    /// notifications.
    ///
    /// descriptor: should be the resample IrqFd.
    pub fn resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        {
            // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
            // defaults to u8; here irq_set.data holds a descriptor as u32, so 4 default u8 are
            // combined together as one u32. It is safe as enough space is reserved through
            // vec_with_array_field(u32)<1>.
            let descriptors = unsafe { irq_set[0].data.as_mut_slice(4) };
            descriptors.copy_from_slice(&descriptor.as_raw_descriptor().to_le_bytes()[..]);
        }

        // Safe as we are the owner of self and irq_set, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqEnable(get_error()))
        } else {
            Ok(())
        }
    }

    /// Disable vfio device's irq and disconnect the irqfd Event from the device.
    pub fn irq_disable(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // Safe as we are the owner of self and irq_set, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqDisable(get_error()))
        } else {
            Ok(())
        }
    }

    /// Unmask vfio device irq
    pub fn irq_unmask(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // Safe as we are the owner of self and irq_set, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqUnmask(get_error()))
        } else {
            Ok(())
        }
    }

    /// Mask vfio device irq
    pub fn irq_mask(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // Safe as we are the owner of self and irq_set, which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqMask(get_error()))
        } else {
            Ok(())
        }
    }

    /// Get and validate VFIO device information.
    fn get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)> {
        let mut dev_info = vfio_device_info {
            argsz: mem::size_of::<vfio_device_info>() as u32,
            flags: 0,
            num_regions: 0,
            num_irqs: 0,
            ..Default::default()
        };

        // Safe as we are the owner of device_file and dev_info, which are valid values,
        // and we verify the return value.
        let ret = unsafe { ioctl_with_mut_ref(device_file, VFIO_DEVICE_GET_INFO(), &mut dev_info) };
        if ret < 0 {
            return Err(VfioError::VfioDeviceGetInfo(get_error()));
        }

        let dev_type = if (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) != 0 {
            if dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
                || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
            {
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            VfioDeviceType::Pci
        } else if (dev_info.flags & VFIO_DEVICE_FLAGS_PLATFORM) != 0 {
            VfioDeviceType::Platform
        } else {
            return Err(VfioError::UnknownDeviceType(dev_info.flags));
        };

        Ok((dev_info, dev_type))
    }

    /// Query interrupt information.
    /// Returns a vector of interrupt information, each entry containing flags and index.
    pub fn get_irqs(&self) -> Result<Vec<VfioIrq>> {
        let mut irqs: Vec<VfioIrq> = Vec::new();

        for i in 0..self.num_irqs {
            let argsz = mem::size_of::<vfio_irq_info>() as u32;
            let mut irq_info = vfio_irq_info {
                argsz,
                flags: 0,
                index: i,
                count: 0,
            };
            // Safe as we are the owner of dev and irq_info, which are valid values,
            // and we verify the return value.
            let ret = unsafe {
                ioctl_with_mut_ref(
                    self.device_file(),
                    VFIO_DEVICE_GET_IRQ_INFO(),
                    &mut irq_info,
                )
            };
            if ret < 0 || irq_info.count != 1 {
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            let irq = VfioIrq {
                flags: irq_info.flags,
                index: irq_info.index,
            };
            irqs.push(irq);
        }
        Ok(irqs)
    }

    #[allow(clippy::cast_ptr_alignment)]
    fn get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>> {
        let mut regions: Vec<VfioRegion> = Vec::new();
        for i in 0..num_regions {
            let argsz = mem::size_of::<vfio_region_info>() as u32;
            let mut reg_info = vfio_region_info {
                argsz,
                flags: 0,
                index: i,
                cap_offset: 0,
                size: 0,
                offset: 0,
            };
            // Safe as we are the owner of dev and reg_info, which are valid values,
            // and we verify the return value.
            let ret =
                unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO(), &mut reg_info) };
            if ret < 0 {
                continue;
            }

            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
            let mut cap_info: Option<(u32, u32)> = None;
            if reg_info.argsz > argsz {
                let cap_len: usize = (reg_info.argsz - argsz) as usize;
                let mut region_with_cap =
                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
                region_with_cap[0].region_info.argsz = reg_info.argsz;
                region_with_cap[0].region_info.flags = 0;
                region_with_cap[0].region_info.index = i;
                region_with_cap[0].region_info.cap_offset = 0;
                region_with_cap[0].region_info.size = 0;
                region_with_cap[0].region_info.offset = 0;
                // Safe as we are the owner of dev and region_info, which are valid values,
                // and we verify the return value.
                let ret = unsafe {
                    ioctl_with_mut_ref(
                        dev,
                        VFIO_DEVICE_GET_REGION_INFO(),
                        &mut (region_with_cap[0].region_info),
                    )
                };
                if ret < 0 {
                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
                }

                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
                    continue;
                }

                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
                let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
                let region_info_sz = reg_info.argsz;

                // region_with_cap[0].cap_info may contain several structures, like a
                // vfio_region_info_cap_sparse_mmap struct or a vfio_region_info_cap_type
                // struct. Each of them begins with a vfio_info_cap_header, so we extract
                // the individual caps via that header.
                // Go through all the cap structs.
                let info_ptr = region_with_cap.as_ptr() as *mut u8;
                let mut offset = region_with_cap[0].region_info.cap_offset;
                while offset != 0 {
                    if offset + cap_header_sz > region_info_sz {
                        break;
                    }
                    // Safe, as cap_header struct is in this function allocated region_with_cap
                    // vec.
                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
                    let cap_header =
                        unsafe { &*(cap_ptr as *mut u8 as *const vfio_info_cap_header) };
                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
                        if offset + mmap_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
                        // Safe, this vfio_region_info_cap_sparse_mmap is in this function
                        // allocated region_with_cap vec.
                        let sparse_mmap = unsafe {
                            &*(cap_ptr as *mut u8 as *const vfio_region_info_cap_sparse_mmap)
                        };

                        let area_num = sparse_mmap.nr_areas;
                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
                            break;
                        }
                        // Safe, these vfio_region_sparse_mmap_area are in this function
                        // allocated region_with_cap vec.
                        let areas =
                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
                        for area in areas.iter() {
                            mmaps.push(*area);
                        }
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
                        if offset + type_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_type here
                        // Safe, this vfio_region_info_cap_type is in this function allocated
                        // region_with_cap vec
                        let cap_type_info =
                            unsafe { &*(cap_ptr as *mut u8 as *const vfio_region_info_cap_type) };

                        cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_MSIX_MAPPABLE {
                        mmaps.push(vfio_region_sparse_mmap_area {
                            offset: 0,
                            size: region_with_cap[0].region_info.size,
                        });
                    }

                    offset = cap_header.next;
                }
            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: 0,
                    size: reg_info.size,
                });
            }

            let region = VfioRegion {
                flags: reg_info.flags,
                size: reg_info.size,
                offset: reg_info.offset,
                mmaps,
                cap_info,
            };
            regions.push(region);
        }

        Ok(regions)
    }

    /// Get a region's flags.
    /// The returned value may contain:
    /// VFIO_REGION_INFO_FLAG_READ:  region supports read
    /// VFIO_REGION_INFO_FLAG_WRITE: region supports write
    /// VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
    /// VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
    pub fn get_region_flags(&self, index: u32) -> u32 {
        match self.regions.get(index as usize) {
            Some(v) => v.flags,
            None => {
                warn!("get_region_flags() with invalid index: {}", index);
                0
            }
        }
    }

    /// get a region's offset
    /// return: Region offset from the start of vfio device descriptor
    pub fn get_region_offset(&self, index: u32) -> u64 {
        match self.regions.get(index as usize) {
            Some(v) => v.offset,
            None => {
                warn!("get_region_offset with invalid index: {}", index);
                0
            }
        }
    }

    /// get a region's size
    /// return: Region size from the start of vfio device descriptor
    pub fn get_region_size(&self, index: u32) -> u64 {
        match self.regions.get(index as usize) {
            Some(v) => v.size,
            None => {
                warn!("get_region_size with invalid index: {}", index);
                0
            }
        }
    }

    /// get the number of regions
    /// return: Number of regions of vfio device descriptor
    pub fn get_region_count(&self) -> u32 {
        self.regions.len() as u32
    }

    /// get a region's mmap info vector
    pub fn get_region_mmap(&self, index: u32) -> Vec<vfio_region_sparse_mmap_area> {
        match self.regions.get(index as usize) {
            Some(v) => v.mmaps.clone(),
            None => {
                warn!("get_region_mmap with invalid index: {}", index);
                Vec::new()
            }
        }
    }

    /// find the specified cap type in device regions
    /// Input:
    ///     type_: cap type
    ///     sub_type: cap sub_type
    /// Output:
    ///     None: device doesn't have the specified cap type
    ///     Some((bar_index, region_size)): device has the specified cap type, return region's
    ///     index and size
    pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
        for (index, region) in self.regions.iter().enumerate() {
            if let Some(cap_info) = &region.cap_info {
                if cap_info.0 == type_ && cap_info.1 == sub_type {
                    return Some((index as u32, region.size));
                }
            }
        }

        None
    }

    /// Returns file offset corresponding to the given `VfioRegionAddr`.
    /// The offset can be used when reading/writing the VFIO device's FD directly.
    pub fn get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64> {
        let region = self
            .regions
            .get(addr.index as usize)
            .ok_or(VfioError::InvalidIndex(addr.index))?;
        Ok(region.offset + addr.addr)
    }

    /// Read region's data from VFIO device into buf.
    /// index: region number
    /// buf: data destination; the buf length is the read size
    /// addr: offset in the region
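    ///
    /// A sketch of reading the first config-space dword (vendor/device ID) of a
    /// PCI device:
    ///
    /// ```ignore
    /// let mut buf = [0u8; 4];
    /// device.region_read(VFIO_PCI_CONFIG_REGION_INDEX, &mut buf, 0);
    /// ```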
    pub fn region_read(&self, index: u32, buf: &mut [u8], addr: u64) {
        let stub: &VfioRegion = self
            .regions
            .get(index as usize)
            .unwrap_or_else(|| panic!("tried to read VFIO with an invalid index: {}", index));

        let size = buf.len() as u64;
        if size > stub.size || addr + size > stub.size {
            panic!(
                "tried to read VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
                index, addr, size
            );
        }

        self.dev
            .read_exact_at(buf, stub.offset + addr)
            .unwrap_or_else(|e| {
                panic!(
                    "failed to read region: index={}, addr=0x{:x}, error={}",
                    index, addr, e
                )
            });
    }

    /// Reads a value from the specified `VfioRegionAddr.addr` + `offset`.
    pub fn region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T {
        let mut val = mem::MaybeUninit::zeroed();
        // Safe because we have zero-initialized `size_of::<T>()` bytes.
        let buf =
            unsafe { slice::from_raw_parts_mut(val.as_mut_ptr() as *mut u8, mem::size_of::<T>()) };
        self.region_read(addr.index, buf, addr.addr + offset);
        // Safe because any bit pattern is valid for a type that implements FromBytes.
        unsafe { val.assume_init() }
    }

    /// Write the data from buf into a vfio device region.
    /// index: region number
    /// buf: data source; the buf length is the write size
    /// addr: offset in the region
    pub fn region_write(&self, index: u32, buf: &[u8], addr: u64) {
        let stub: &VfioRegion = self
            .regions
            .get(index as usize)
            .unwrap_or_else(|| panic!("tried to write VFIO with an invalid index: {}", index));

        let size = buf.len() as u64;
        if size > stub.size
            || addr + size > stub.size
            || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
        {
            panic!(
                "tried to write VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
                index, addr, size
            );
        }

        self.dev
            .write_all_at(buf, stub.offset + addr)
            .unwrap_or_else(|e| {
                panic!(
                    "failed to write region: index={}, addr=0x{:x}, error={}",
                    index, addr, e
                )
            });
    }

    /// Writes data into the specified `VfioRegionAddr.addr` + `offset`.
    pub fn region_write_to_addr<T: AsBytes>(&self, val: &T, addr: &VfioRegionAddr, offset: u64) {
        self.region_write(addr.index, val.as_bytes(), addr.addr + offset);
    }

    /// Get vfio device's descriptors which are passed into the minijail process.
    pub fn keep_rds(&self) -> Vec<RawDescriptor> {
        vec![
            self.dev.as_raw_descriptor(),
            self.group_descriptor,
            self.container.lock().as_raw_descriptor(),
        ]
    }

    /// Add an (iova, user_addr) mapping into the vfio container's iommu table.
    ///
    /// # Safety
    ///
    /// See `VfioContainer::vfio_dma_map`: the memory behind
    /// `user_addr..user_addr + size` must stay valid while the mapping is in place.
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        self.container
            .lock()
            .vfio_dma_map(iova, size, user_addr, write_en)
    }

    /// Remove an (iova, user_addr) mapping from the vfio container's iommu table.
    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        self.container.lock().vfio_dma_unmap(iova, size)
    }

    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
        self.container.lock().vfio_get_iommu_page_size_mask()
    }

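    /// Allocates an IOVA range of `size` bytes (aligned to `align_size`) from the
    /// device's IOVA allocator, tagged with `alloc`; returns the starting IOVA.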
    pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64> {
        self.iova_alloc
            .lock()
            .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size)
            .map_err(VfioError::Resources)
    }

    pub fn get_iova(&self, alloc: &Alloc) -> Option<AddressRange> {
        self.iova_alloc.lock().get(alloc).map(|res| res.0)
    }

    pub fn release_iova(&self, alloc: Alloc) -> Result<AddressRange> {
        self.iova_alloc
            .lock()
            .release(alloc)
            .map_err(VfioError::Resources)
    }

    pub fn get_max_addr(&self) -> u64 {
        self.iova_alloc.lock().get_max_addr()
    }

    /// Gets the vfio device backing `File`.
    pub fn device_file(&self) -> &File {
        &self.dev
    }

    /// close vfio device
    pub fn close(&self) {
        self.container.lock().remove_group(self.group_id, true);
    }
}

pub struct VfioPciConfig {
    device: Arc<VfioDevice>,
}

impl VfioPciConfig {
    pub fn new(device: Arc<VfioDevice>) -> Self {
        VfioPciConfig { device }
    }

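    /// Reads a `T` from PCI config space at `offset`.
    ///
    /// A sketch of reading the 16-bit vendor ID (offset 0 in PCI config space):
    ///
    /// ```ignore
    /// let vendor_id: u16 = config.read_config(0);
    /// ```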
    pub fn read_config<T: FromBytes>(&self, offset: u32) -> T {
        let mut buf = vec![0u8; std::mem::size_of::<T>()];
        self.device
            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, &mut buf, offset.into());
        T::read_from(&buf[..]).expect("failed to convert config data from slice")
    }

    pub fn write_config<T: AsBytes>(&self, config: T, offset: u32) {
        self.device.region_write(
            VFIO_PCI_CONFIG_REGION_INDEX,
            config.as_bytes(),
            offset.into(),
        );
    }

    /// Set the VFIO device this config refers to as the bus master.
    pub fn set_bus_master(&self) {
        /// Constant definitions from `linux/pci_regs.h`.
        const PCI_COMMAND: u32 = 0x4;
        /// Enable bus mastering
        const PCI_COMMAND_MASTER: u16 = 0x4;

        let mut cmd: u16 = self.read_config(PCI_COMMAND);

        if cmd & PCI_COMMAND_MASTER != 0 {
            return;
        }

        cmd |= PCI_COMMAND_MASTER;

        self.write_config(cmd, PCI_COMMAND);
    }
}

impl AsRawDescriptor for VfioDevice {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.dev.as_raw_descriptor()
    }
}