1 // Copyright 2020 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
6 mod aarch64;
7 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
8 pub use aarch64::*;
9 use base::sys::BlockedSignal;
10 
11 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
12 mod x86_64;
13 use std::cell::RefCell;
14 use std::cmp::min;
15 use std::cmp::Reverse;
16 use std::collections::BTreeMap;
17 use std::collections::BinaryHeap;
18 use std::convert::TryFrom;
19 use std::ffi::CString;
20 use std::mem::size_of;
21 use std::mem::ManuallyDrop;
22 use std::os::raw::c_int;
23 use std::os::raw::c_ulong;
24 use std::os::raw::c_void;
25 use std::os::unix::prelude::OsStrExt;
26 use std::path::Path;
27 use std::path::PathBuf;
28 use std::ptr::copy_nonoverlapping;
29 use std::sync::atomic::AtomicU64;
30 use std::sync::Arc;
31 
32 use base::errno_result;
33 use base::error;
34 use base::ioctl;
35 use base::ioctl_with_mut_ref;
36 use base::ioctl_with_ref;
37 use base::ioctl_with_val;
38 use base::pagesize;
39 use base::signal;
40 use base::AsRawDescriptor;
41 use base::Error;
42 use base::Event;
43 use base::FromRawDescriptor;
44 use base::MappedRegion;
45 use base::MemoryMapping;
46 use base::MemoryMappingBuilder;
47 use base::MemoryMappingBuilderUnix;
48 use base::MmapError;
49 use base::Protection;
50 use base::RawDescriptor;
51 use base::Result;
52 use base::SafeDescriptor;
53 use data_model::vec_with_array_field;
54 use kvm_sys::*;
55 use libc::open64;
56 use libc::sigset_t;
57 use libc::EBUSY;
58 use libc::EFAULT;
59 use libc::EINVAL;
60 use libc::EIO;
61 use libc::ENOENT;
62 use libc::ENOSPC;
63 use libc::ENOSYS;
64 use libc::EOVERFLOW;
65 use libc::O_CLOEXEC;
66 use libc::O_RDWR;
67 use sync::Mutex;
68 use vm_memory::GuestAddress;
69 use vm_memory::GuestMemory;
70 use vm_memory::MemoryRegionInformation;
71 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
72 pub use x86_64::*;
73 
74 use crate::ClockState;
75 use crate::Config;
76 use crate::Datamatch;
77 use crate::DeviceKind;
78 use crate::HypervHypercall;
79 use crate::Hypervisor;
80 use crate::HypervisorCap;
81 use crate::IoEventAddress;
82 use crate::IoOperation;
83 use crate::IoParams;
84 use crate::IrqRoute;
85 use crate::IrqSource;
86 use crate::MPState;
87 use crate::MemSlot;
88 use crate::Vcpu;
89 use crate::VcpuExit;
90 use crate::VcpuRunHandle;
91 use crate::Vm;
92 use crate::VmCap;
93 
94 // Wrapper around KVM_SET_USER_MEMORY_REGION ioctl, which creates, modifies, or deletes a mapping
95 // from guest physical to host user pages.
96 //
97 // Safe when the guest regions are guaranteed not to overlap.
98 unsafe fn set_user_memory_region(
99     descriptor: &SafeDescriptor,
100     slot: MemSlot,
101     read_only: bool,
102     log_dirty_pages: bool,
103     guest_addr: u64,
104     memory_size: u64,
105     userspace_addr: *mut u8,
106 ) -> Result<()> {
107     let mut flags = if read_only { KVM_MEM_READONLY } else { 0 };
108     if log_dirty_pages {
109         flags |= KVM_MEM_LOG_DIRTY_PAGES;
110     }
111     let region = kvm_userspace_memory_region {
112         slot,
113         flags,
114         guest_phys_addr: guest_addr,
115         memory_size,
116         userspace_addr: userspace_addr as u64,
117     };
118 
119     let ret = ioctl_with_ref(descriptor, KVM_SET_USER_MEMORY_REGION(), &region);
120     if ret == 0 {
121         Ok(())
122     } else {
123         errno_result()
124     }
125 }
126 
127 /// Helper function to determine the size in bytes of a dirty log bitmap for the given memory region
128 /// size.
129 ///
130 /// # Arguments
131 ///
132 /// * `size` - Number of bytes in the memory region being queried.
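///
/// # Example
///
/// A minimal sketch (not part of the original file) of sizing the buffer handed to
/// `Vm::get_dirty_log`; the crate path and the 16 MiB region size are illustrative
/// assumptions:
///
/// ```ignore
/// use hypervisor::kvm::dirty_log_bitmap_size;
///
/// let region_size = 16 * 1024 * 1024; // bytes of guest memory in the slot
/// // One bit per guest page, rounded up to whole bytes.
/// let mut dirty_log = vec![0u8; dirty_log_bitmap_size(region_size)];
/// // `dirty_log` can now be passed as the bitmap argument of `get_dirty_log`.
/// ```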
133 pub fn dirty_log_bitmap_size(size: usize) -> usize {
134     let page_size = pagesize();
135     (((size + page_size - 1) / page_size) + 7) / 8
136 }
137 
138 pub struct Kvm {
139     kvm: SafeDescriptor,
140 }
141 
142 pub type KvmCap = kvm::Cap;
143 
144 impl Kvm {
145     pub fn new_with_path(device_path: &Path) -> Result<Kvm> {
146         // Open calls are safe because we give a nul-terminated string and verify the result.
147         let c_path = CString::new(device_path.as_os_str().as_bytes()).unwrap();
148         let ret = unsafe { open64(c_path.as_ptr(), O_RDWR | O_CLOEXEC) };
149         if ret < 0 {
150             return errno_result();
151         }
152         // Safe because we verify that ret is valid and we own the fd.
153         let kvm = unsafe { SafeDescriptor::from_raw_descriptor(ret) };
154 
155         // Safe because we know that the descriptor is valid and we verify the return result.
156         let version = unsafe { ioctl(&kvm, KVM_GET_API_VERSION()) };
157         if version < 0 {
158             return errno_result();
159         }
160 
161         // Per the kernel KVM API documentation: "Applications should refuse to run if
162         // KVM_GET_API_VERSION returns a value other than 12."
163         if version as u32 != KVM_API_VERSION {
164             error!(
165                 "KVM_GET_API_VERSION: expected {}, got {}",
166                 KVM_API_VERSION, version,
167             );
168             return Err(Error::new(ENOSYS));
169         }
170 
171         Ok(Kvm { kvm })
172     }
173 
174     /// Opens `/dev/kvm` and returns a Kvm object on success.
175     pub fn new() -> Result<Kvm> {
176         Kvm::new_with_path(&PathBuf::from("/dev/kvm"))
177     }
178 
179     /// Gets the size of the mmap required to use a vcpu's `kvm_run` structure.
180     pub fn get_vcpu_mmap_size(&self) -> Result<usize> {
181         // Safe because we know that our file is a KVM fd and we verify the return result.
182         let res = unsafe { ioctl(self, KVM_GET_VCPU_MMAP_SIZE()) };
183         if res > 0 {
184             Ok(res as usize)
185         } else {
186             errno_result()
187         }
188     }
189 }
190 
191 impl AsRawDescriptor for Kvm {
192     fn as_raw_descriptor(&self) -> RawDescriptor {
193         self.kvm.as_raw_descriptor()
194     }
195 }
196 
197 impl Hypervisor for Kvm {
198     fn try_clone(&self) -> Result<Self> {
199         Ok(Kvm {
200             kvm: self.kvm.try_clone()?,
201         })
202     }
203 
204     fn check_capability(&self, cap: HypervisorCap) -> bool {
205         if let Ok(kvm_cap) = KvmCap::try_from(cap) {
206             // this ioctl is safe because we know this kvm descriptor is valid,
207             // and we are copying over the kvm capability (u32) as a c_ulong value.
208             unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), kvm_cap as c_ulong) == 1 }
209         } else {
210             // this capability cannot be converted on this platform, so return false
211             false
212         }
213     }
214 }
215 
216 /// A wrapper around creating and using a KVM VM.
217 pub struct KvmVm {
218     kvm: Kvm,
219     vm: SafeDescriptor,
220     guest_mem: GuestMemory,
221     mem_regions: Arc<Mutex<BTreeMap<MemSlot, Box<dyn MappedRegion>>>>,
222     /// A min-heap of `MemSlot` numbers that were used and then removed and can now be re-used.
223     mem_slot_gaps: Arc<Mutex<BinaryHeap<Reverse<MemSlot>>>>,
224 }
225 
226 impl KvmVm {
227     /// Constructs a new `KvmVm` using the given `Kvm` instance.
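    ///
    /// # Example
    ///
    /// A hedged construction sketch; the crate paths, the single 64 MiB memory region, and the
    /// use of `Config::default()` are assumptions made for illustration only:
    ///
    /// ```ignore
    /// use hypervisor::kvm::{Kvm, KvmVm};
    /// use hypervisor::Config;
    /// use vm_memory::{GuestAddress, GuestMemory};
    ///
    /// let kvm = Kvm::new()?;
    /// let mem = GuestMemory::new(&[(GuestAddress(0), 64 << 20)]).unwrap();
    /// let vm = KvmVm::new(&kvm, mem, Config::default())?;
    /// let vcpu = vm.create_kvm_vcpu(0)?;
    /// ```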
228     pub fn new(kvm: &Kvm, guest_mem: GuestMemory, cfg: Config) -> Result<KvmVm> {
229         // Safe because we know kvm is a real kvm fd as this module is the only one that can make
230         // Kvm objects.
231         let ret = unsafe {
232             ioctl_with_val(
233                 kvm,
234                 KVM_CREATE_VM(),
235                 kvm.get_vm_type(cfg.protection_type)? as c_ulong,
236             )
237         };
238         if ret < 0 {
239             return errno_result();
240         }
241         // Safe because we verify that ret is valid and we own the fd.
242         let vm_descriptor = unsafe { SafeDescriptor::from_raw_descriptor(ret) };
243         guest_mem.with_regions(
244             |MemoryRegionInformation {
245                  index,
246                  guest_addr,
247                  size,
248                  host_addr,
249                  ..
250              }| {
251                 unsafe {
252                     // Safe because the guest regions are guaranteed not to overlap.
253                     set_user_memory_region(
254                         &vm_descriptor,
255                         index as MemSlot,
256                         false,
257                         false,
258                         guest_addr.offset(),
259                         size as u64,
260                         host_addr as *mut u8,
261                     )
262                 }
263             },
264         )?;
265 
266         let vm = KvmVm {
267             kvm: kvm.try_clone()?,
268             vm: vm_descriptor,
269             guest_mem,
270             mem_regions: Arc::new(Mutex::new(BTreeMap::new())),
271             mem_slot_gaps: Arc::new(Mutex::new(BinaryHeap::new())),
272         };
273         vm.init_arch(&cfg)?;
274         Ok(vm)
275     }
276 
277     pub fn create_kvm_vcpu(&self, id: usize) -> Result<KvmVcpu> {
278         let run_mmap_size = self.kvm.get_vcpu_mmap_size()?;
279 
280         // Safe because we know that our file is a VM fd and we verify the return result.
281         let fd = unsafe { ioctl_with_val(self, KVM_CREATE_VCPU(), c_ulong::try_from(id).unwrap()) };
282         if fd < 0 {
283             return errno_result();
284         }
285 
286         // Wrap the vcpu now in case the following ? returns early. This is safe because we verified
287         // the value of the fd and we own the fd.
288         let vcpu = unsafe { SafeDescriptor::from_raw_descriptor(fd) };
289 
290         let run_mmap = MemoryMappingBuilder::new(run_mmap_size)
291             .from_descriptor(&vcpu)
292             .build()
293             .map_err(|_| Error::new(ENOSPC))?;
294 
295         Ok(KvmVcpu {
296             kvm: self.kvm.try_clone()?,
297             vm: self.vm.try_clone()?,
298             vcpu,
299             id,
300             run_mmap,
301             vcpu_run_handle_fingerprint: Default::default(),
302         })
303     }
304 
305     /// Creates an in-kernel interrupt controller.
306     ///
307     /// See the documentation on the KVM_CREATE_IRQCHIP ioctl.
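    ///
    /// A brief sketch (illustrative; `vm` is assumed to be a freshly created `KvmVm`):
    ///
    /// ```ignore
    /// // The in-kernel irqchip must exist before irqfds or MP state calls can succeed.
    /// vm.create_irq_chip()?;
    /// ```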
308     pub fn create_irq_chip(&self) -> Result<()> {
309         // Safe because we know that our file is a VM fd and we verify the return result.
310         let ret = unsafe { ioctl(self, KVM_CREATE_IRQCHIP()) };
311         if ret == 0 {
312             Ok(())
313         } else {
314             errno_result()
315         }
316     }
317 
318     /// Sets the level on the given irq to 1 if `active` is true, and 0 otherwise.
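    ///
    /// For an edge-triggered source, a pulse can be modelled by raising and then lowering the
    /// line (illustrative sketch; the irq number is an arbitrary assumption):
    ///
    /// ```ignore
    /// vm.set_irq_line(5, true)?;  // assert IRQ 5
    /// vm.set_irq_line(5, false)?; // de-assert it again
    /// ```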
319     pub fn set_irq_line(&self, irq: u32, active: bool) -> Result<()> {
320         let mut irq_level = kvm_irq_level::default();
321         irq_level.__bindgen_anon_1.irq = irq;
322         irq_level.level = active.into();
323 
324         // Safe because we know that our file is a VM fd, we know the kernel will only read the
325         // correct amount of memory from our pointer, and we verify the return result.
326         let ret = unsafe { ioctl_with_ref(self, KVM_IRQ_LINE(), &irq_level) };
327         if ret == 0 {
328             Ok(())
329         } else {
330             errno_result()
331         }
332     }
333 
334     /// Registers an event that will, when signalled, trigger the `gsi` irq. If `resample_evt`
335     /// is not `None`, it will be triggered when the irqchip is resampled.
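    ///
    /// A minimal sketch (illustrative; assumes the in-kernel irqchip has already been created):
    ///
    /// ```ignore
    /// let evt = base::Event::new()?;
    /// let resample = base::Event::new()?;
    /// // Signalling `evt` now injects GSI 24; `resample` fires when the irqchip is resampled.
    /// vm.register_irqfd(24, &evt, Some(&resample))?;
    /// ```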
336     pub fn register_irqfd(
337         &self,
338         gsi: u32,
339         evt: &Event,
340         resample_evt: Option<&Event>,
341     ) -> Result<()> {
342         let mut irqfd = kvm_irqfd {
343             fd: evt.as_raw_descriptor() as u32,
344             gsi,
345             ..Default::default()
346         };
347 
348         if let Some(r_evt) = resample_evt {
349             irqfd.flags = KVM_IRQFD_FLAG_RESAMPLE;
350             irqfd.resamplefd = r_evt.as_raw_descriptor() as u32;
351         }
352 
353         // Safe because we know that our file is a VM fd, we know the kernel will only read the
354         // correct amount of memory from our pointer, and we verify the return result.
355         let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &irqfd) };
356         if ret == 0 {
357             Ok(())
358         } else {
359             errno_result()
360         }
361     }
362 
363     /// Unregisters an event that was previously registered with
364     /// `register_irqfd`.
365     ///
366     /// The `evt` and `gsi` pair must be the same as the ones passed into
367     /// `register_irqfd`.
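    ///
    /// A short sketch (illustrative) showing the pairing requirement:
    ///
    /// ```ignore
    /// vm.register_irqfd(24, &evt, None)?;
    /// // ... later, tear the binding down with the identical (gsi, evt) pair:
    /// vm.unregister_irqfd(24, &evt)?;
    /// ```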
368     pub fn unregister_irqfd(&self, gsi: u32, evt: &Event) -> Result<()> {
369         let irqfd = kvm_irqfd {
370             fd: evt.as_raw_descriptor() as u32,
371             gsi,
372             flags: KVM_IRQFD_FLAG_DEASSIGN,
373             ..Default::default()
374         };
375         // Safe because we know that our file is a VM fd, we know the kernel will only read the
376         // correct amount of memory from our pointer, and we verify the return result.
377         let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &irqfd) };
378         if ret == 0 {
379             Ok(())
380         } else {
381             errno_result()
382         }
383     }
384 
385     /// Sets the GSI routing table, replacing any table set with previous calls to
386     /// `set_gsi_routing`.
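    ///
    /// A hedged sketch of building a small routing table; the GSI number and the MSI
    /// address/data values are arbitrary illustrations:
    ///
    /// ```ignore
    /// use hypervisor::{IrqRoute, IrqSource};
    ///
    /// let routes = [IrqRoute {
    ///     gsi: 5,
    ///     source: IrqSource::Msi {
    ///         address: 0xfee0_0000,
    ///         data: 0x20,
    ///     },
    /// }];
    /// vm.set_gsi_routing(&routes)?;
    /// ```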
387     pub fn set_gsi_routing(&self, routes: &[IrqRoute]) -> Result<()> {
388         let mut irq_routing =
389             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(routes.len());
390         irq_routing[0].nr = routes.len() as u32;
391 
392         // Safe because we ensured there is enough space in irq_routing to hold the number of
393         // route entries.
394         let irq_routes = unsafe { irq_routing[0].entries.as_mut_slice(routes.len()) };
395         for (route, irq_route) in routes.iter().zip(irq_routes.iter_mut()) {
396             *irq_route = kvm_irq_routing_entry::from(route);
397         }
398 
399         let ret = unsafe { ioctl_with_ref(self, KVM_SET_GSI_ROUTING(), &irq_routing[0]) };
400         if ret == 0 {
401             Ok(())
402         } else {
403             errno_result()
404         }
405     }
406 
407     fn ioeventfd(
408         &self,
409         evt: &Event,
410         addr: IoEventAddress,
411         datamatch: Datamatch,
412         deassign: bool,
413     ) -> Result<()> {
414         let (do_datamatch, datamatch_value, datamatch_len) = match datamatch {
415             Datamatch::AnyLength => (false, 0, 0),
416             Datamatch::U8(v) => match v {
417                 Some(u) => (true, u as u64, 1),
418                 None => (false, 0, 1),
419             },
420             Datamatch::U16(v) => match v {
421                 Some(u) => (true, u as u64, 2),
422                 None => (false, 0, 2),
423             },
424             Datamatch::U32(v) => match v {
425                 Some(u) => (true, u as u64, 4),
426                 None => (false, 0, 4),
427             },
428             Datamatch::U64(v) => match v {
429                 Some(u) => (true, u as u64, 8),
430                 None => (false, 0, 8),
431             },
432         };
433         let mut flags = 0;
434         if deassign {
435             flags |= 1 << kvm_ioeventfd_flag_nr_deassign;
436         }
437         if do_datamatch {
438             flags |= 1 << kvm_ioeventfd_flag_nr_datamatch
439         }
440         if let IoEventAddress::Pio(_) = addr {
441             flags |= 1 << kvm_ioeventfd_flag_nr_pio;
442         }
443         let ioeventfd = kvm_ioeventfd {
444             datamatch: datamatch_value,
445             len: datamatch_len,
446             addr: match addr {
447                 IoEventAddress::Pio(p) => p as u64,
448                 IoEventAddress::Mmio(m) => m,
449             },
450             fd: evt.as_raw_descriptor(),
451             flags,
452             ..Default::default()
453         };
454         // Safe because we know that our file is a VM fd, we know the kernel will only read the
455         // correct amount of memory from our pointer, and we verify the return result.
456         let ret = unsafe { ioctl_with_ref(self, KVM_IOEVENTFD(), &ioeventfd) };
457         if ret == 0 {
458             Ok(())
459         } else {
460             errno_result()
461         }
462     }
463 
464     /// Checks whether a particular KVM-specific capability is available for this VM.
465     pub fn check_raw_capability(&self, capability: KvmCap) -> bool {
466         // Safe because we know that our file is a KVM fd, and if the cap is invalid KVM assumes
467         // it's an unavailable extension and returns 0.
468         let ret = unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), capability as c_ulong) };
469         match capability {
470             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
471             KvmCap::BusLockDetect => {
472                 if ret > 0 {
473                     ret as u32 & KVM_BUS_LOCK_DETECTION_EXIT == KVM_BUS_LOCK_DETECTION_EXIT
474                 } else {
475                     false
476                 }
477             }
478             _ => ret == 1,
479         }
480     }
481 
482     // Currently only used on aarch64, but works on any architecture.
483     #[allow(dead_code)]
484     /// Enables a KVM-specific capability for this VM, with the given arguments.
485     ///
486     /// # Safety
487     /// This function is marked as unsafe because `args` may be interpreted as pointers for some
488     /// capabilities. The caller must ensure that any pointers passed in the `args` array are
489     /// allocated as the kernel expects, and that mutable pointers are owned.
490     unsafe fn enable_raw_capability(
491         &self,
492         capability: KvmCap,
493         flags: u32,
494         args: &[u64; 4],
495     ) -> Result<()> {
496         let kvm_cap = kvm_enable_cap {
497             cap: capability as u32,
498             args: *args,
499             flags,
500             ..Default::default()
501         };
502         // Safe because we allocated the struct and we know the kernel will read exactly the size of
503         // the struct, and because we assume the caller has allocated the args appropriately.
504         let ret = ioctl_with_ref(self, KVM_ENABLE_CAP(), &kvm_cap);
505         if ret == 0 {
506             Ok(())
507         } else {
508             errno_result()
509         }
510     }
511 }
512 
513 impl Vm for KvmVm {
514     fn try_clone(&self) -> Result<Self> {
515         Ok(KvmVm {
516             kvm: self.kvm.try_clone()?,
517             vm: self.vm.try_clone()?,
518             guest_mem: self.guest_mem.clone(),
519             mem_regions: self.mem_regions.clone(),
520             mem_slot_gaps: self.mem_slot_gaps.clone(),
521         })
522     }
523 
524     fn check_capability(&self, c: VmCap) -> bool {
525         if let Some(val) = self.check_capability_arch(c) {
526             return val;
527         }
528         match c {
529             VmCap::DirtyLog => true,
530             VmCap::PvClock => false,
531             VmCap::PvClockSuspend => self.check_raw_capability(KvmCap::KvmclockCtrl),
532             VmCap::Protected => self.check_raw_capability(KvmCap::ArmProtectedVm),
533             VmCap::EarlyInitCpuid => false,
534             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
535             VmCap::BusLockDetect => self.check_raw_capability(KvmCap::BusLockDetect),
536         }
537     }
538 
539     fn enable_capability(&self, c: VmCap, _flags: u32) -> Result<bool> {
540         match c {
541             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
542             VmCap::BusLockDetect => {
543                 let args = [KVM_BUS_LOCK_DETECTION_EXIT as u64, 0, 0, 0];
544                 Ok(unsafe {
545                     self.enable_raw_capability(KvmCap::BusLockDetect, _flags, &args) == Ok(())
546                 })
547             }
548             _ => Ok(false),
549         }
550     }
551 
552     fn get_guest_phys_addr_bits(&self) -> u8 {
553         self.kvm.get_guest_phys_addr_bits()
554     }
555 
556     fn get_memory(&self) -> &GuestMemory {
557         &self.guest_mem
558     }
559 
560     fn add_memory_region(
561         &mut self,
562         guest_addr: GuestAddress,
563         mem: Box<dyn MappedRegion>,
564         read_only: bool,
565         log_dirty_pages: bool,
566     ) -> Result<MemSlot> {
567         let pgsz = pagesize() as u64;
568         // KVM requires the user memory region size to be page-size aligned. It is safe to round
569         // mem.size() up to a page-size multiple here because mmap already rounds the mapping
570         // size up to page-size alignment if it is not.
571         let size = (mem.size() as u64 + pgsz - 1) / pgsz * pgsz;
572         let end_addr = guest_addr
573             .checked_add(size)
574             .ok_or_else(|| Error::new(EOVERFLOW))?;
575         if self.guest_mem.range_overlap(guest_addr, end_addr) {
576             return Err(Error::new(ENOSPC));
577         }
578         let mut regions = self.mem_regions.lock();
579         let mut gaps = self.mem_slot_gaps.lock();
580         let slot = match gaps.pop() {
581             Some(gap) => gap.0,
582             None => (regions.len() + self.guest_mem.num_regions() as usize) as MemSlot,
583         };
584 
585         // Safe because we check that the given guest address is valid and has no overlaps. We also
586         // know that the pointer and size are correct because the MemoryMapping interface ensures
587         // this. We take ownership of the memory mapping so that it won't be unmapped until the slot
588         // is removed.
589         let res = unsafe {
590             set_user_memory_region(
591                 &self.vm,
592                 slot,
593                 read_only,
594                 log_dirty_pages,
595                 guest_addr.offset() as u64,
596                 size,
597                 mem.as_ptr(),
598             )
599         };
600 
601         if let Err(e) = res {
602             gaps.push(Reverse(slot));
603             return Err(e);
604         }
605         regions.insert(slot, mem);
606         Ok(slot)
607     }
608 
609     fn msync_memory_region(&mut self, slot: MemSlot, offset: usize, size: usize) -> Result<()> {
610         let mut regions = self.mem_regions.lock();
611         let mem = regions.get_mut(&slot).ok_or_else(|| Error::new(ENOENT))?;
612 
613         mem.msync(offset, size).map_err(|err| match err {
614             MmapError::InvalidAddress => Error::new(EFAULT),
615             MmapError::NotPageAligned => Error::new(EINVAL),
616             MmapError::SystemCallFailed(e) => e,
617             _ => Error::new(EIO),
618         })
619     }
620 
621     fn remove_memory_region(&mut self, slot: MemSlot) -> Result<Box<dyn MappedRegion>> {
622         let mut regions = self.mem_regions.lock();
623         if !regions.contains_key(&slot) {
624             return Err(Error::new(ENOENT));
625         }
626         // Safe because the slot is checked against the list of memory slots.
627         unsafe {
628             set_user_memory_region(&self.vm, slot, false, false, 0, 0, std::ptr::null_mut())?;
629         }
630         self.mem_slot_gaps.lock().push(Reverse(slot));
631         // This remove will always succeed because of the contains_key check above.
632         Ok(regions.remove(&slot).unwrap())
633     }
634 
635     fn create_device(&self, kind: DeviceKind) -> Result<SafeDescriptor> {
636         let device = if let Some(dev) = self.get_device_params_arch(kind) {
637             dev
638         } else {
639             match kind {
640                 DeviceKind::Vfio => kvm_create_device {
641                     type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
642                     fd: 0,
643                     flags: 0,
644                 },
645 
646                 // ARM has additional DeviceKinds, so it needs the catch-all pattern
647                 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
648                 _ => return Err(Error::new(libc::ENXIO)),
649             }
650         };
651 
652         // Safe because we know that our file is a VM fd, we know the kernel will only write the
653         // correct amount of memory to our pointer, and we verify the return result.
654         let ret = unsafe { base::ioctl_with_ref(self, KVM_CREATE_DEVICE(), &device) };
655         if ret == 0 {
656             // Safe because we verify that ret is valid and we own the fd.
657             Ok(unsafe { SafeDescriptor::from_raw_descriptor(device.fd as i32) })
658         } else {
659             errno_result()
660         }
661     }
662 
663     fn get_dirty_log(&self, slot: MemSlot, dirty_log: &mut [u8]) -> Result<()> {
664         let regions = self.mem_regions.lock();
665         let mmap = regions.get(&slot).ok_or_else(|| Error::new(ENOENT))?;
666         // Ensure that dirty_log has at least one bit for every page in the mmap.
667         if dirty_log_bitmap_size(mmap.size()) > dirty_log.len() {
668             return Err(Error::new(EINVAL));
669         }
670 
671         let mut dirty_log_kvm = kvm_dirty_log {
672             slot,
673             ..Default::default()
674         };
675         dirty_log_kvm.__bindgen_anon_1.dirty_bitmap = dirty_log.as_ptr() as *mut c_void;
676         // Safe because the `dirty_bitmap` pointer assigned above is guaranteed to be valid (because
677         // it's from a slice) and we checked that it will be large enough to hold the entire log.
678         let ret = unsafe { ioctl_with_ref(self, KVM_GET_DIRTY_LOG(), &dirty_log_kvm) };
679         if ret == 0 {
680             Ok(())
681         } else {
682             errno_result()
683         }
684     }
685 
686     fn register_ioevent(
687         &mut self,
688         evt: &Event,
689         addr: IoEventAddress,
690         datamatch: Datamatch,
691     ) -> Result<()> {
692         self.ioeventfd(evt, addr, datamatch, false)
693     }
694 
695     fn unregister_ioevent(
696         &mut self,
697         evt: &Event,
698         addr: IoEventAddress,
699         datamatch: Datamatch,
700     ) -> Result<()> {
701         self.ioeventfd(evt, addr, datamatch, true)
702     }
703 
704     fn handle_io_events(&self, _addr: IoEventAddress, _data: &[u8]) -> Result<()> {
705         // KVM delivers IO events in-kernel with ioeventfds, so this is a no-op
706         Ok(())
707     }
708 
709     fn get_pvclock(&self) -> Result<ClockState> {
710         self.get_pvclock_arch()
711     }
712 
713     fn set_pvclock(&self, state: &ClockState) -> Result<()> {
714         self.set_pvclock_arch(state)
715     }
716 
717     fn add_fd_mapping(
718         &mut self,
719         slot: u32,
720         offset: usize,
721         size: usize,
722         fd: &dyn AsRawDescriptor,
723         fd_offset: u64,
724         prot: Protection,
725     ) -> Result<()> {
726         let mut regions = self.mem_regions.lock();
727         let region = regions.get_mut(&slot).ok_or_else(|| Error::new(EINVAL))?;
728 
729         match region.add_fd_mapping(offset, size, fd, fd_offset, prot) {
730             Ok(()) => Ok(()),
731             Err(MmapError::SystemCallFailed(e)) => Err(e),
732             Err(_) => Err(Error::new(EIO)),
733         }
734     }
735 
736     fn remove_mapping(&mut self, slot: u32, offset: usize, size: usize) -> Result<()> {
737         let mut regions = self.mem_regions.lock();
738         let region = regions.get_mut(&slot).ok_or_else(|| Error::new(EINVAL))?;
739 
740         match region.remove_mapping(offset, size) {
741             Ok(()) => Ok(()),
742             Err(MmapError::SystemCallFailed(e)) => Err(e),
743             Err(_) => Err(Error::new(EIO)),
744         }
745     }
746 
747     fn handle_inflate(&mut self, guest_address: GuestAddress, size: u64) -> Result<()> {
748         match self.guest_mem.remove_range(guest_address, size) {
749             Ok(_) => Ok(()),
750             Err(vm_memory::Error::MemoryAccess(_, MmapError::SystemCallFailed(e))) => Err(e),
751             Err(_) => Err(Error::new(EIO)),
752         }
753     }
754 
755     fn handle_deflate(&mut self, _guest_address: GuestAddress, _size: u64) -> Result<()> {
756         // No-op; when the guest attempts to access the pages again, Linux/KVM will provide them.
757         Ok(())
758     }
759 }
760 
761 impl AsRawDescriptor for KvmVm {
762     fn as_raw_descriptor(&self) -> RawDescriptor {
763         self.vm.as_raw_descriptor()
764     }
765 }
766 
767 /// A wrapper around using a KVM Vcpu.
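///
/// # Example
///
/// A minimal run-loop sketch (illustrative only; vcpu setup and full exit handling are elided,
/// and the crate paths are assumptions):
///
/// ```ignore
/// use hypervisor::{Vcpu, VcpuExit};
///
/// let handle = vcpu.take_run_handle(None)?;
/// loop {
///     match vcpu.run(&handle)? {
///         VcpuExit::Hlt | VcpuExit::Shutdown => break,
///         VcpuExit::Io => { /* decode with `handle_io` */ }
///         VcpuExit::Mmio => { /* decode with `handle_mmio` */ }
///         _ => {}
///     }
/// }
/// ```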
768 pub struct KvmVcpu {
769     kvm: Kvm,
770     vm: SafeDescriptor,
771     vcpu: SafeDescriptor,
772     id: usize,
773     run_mmap: MemoryMapping,
774     vcpu_run_handle_fingerprint: Arc<AtomicU64>,
775 }
776 
777 pub(super) struct VcpuThread {
778     run: *mut kvm_run,
779     signal_num: Option<c_int>,
780 }
781 
782 thread_local!(static VCPU_THREAD: RefCell<Option<VcpuThread>> = RefCell::new(None));
783 
784 impl Vcpu for KvmVcpu {
785     fn try_clone(&self) -> Result<Self> {
786         let vm = self.vm.try_clone()?;
787         let vcpu = self.vcpu.try_clone()?;
788         let run_mmap = MemoryMappingBuilder::new(self.run_mmap.size())
789             .from_descriptor(&vcpu)
790             .build()
791             .map_err(|_| Error::new(ENOSPC))?;
792         let vcpu_run_handle_fingerprint = self.vcpu_run_handle_fingerprint.clone();
793 
794         Ok(KvmVcpu {
795             kvm: self.kvm.try_clone()?,
796             vm,
797             vcpu,
798             id: self.id,
799             run_mmap,
800             vcpu_run_handle_fingerprint,
801         })
802     }
803 
804     fn as_vcpu(&self) -> &dyn Vcpu {
805         self
806     }
807 
808     #[allow(clippy::cast_ptr_alignment)]
809     fn take_run_handle(&self, signal_num: Option<c_int>) -> Result<VcpuRunHandle> {
810         fn vcpu_run_handle_drop() {
811             VCPU_THREAD.with(|v| {
812                 // This assumes that a failure in `BlockedSignal::new` means the signal is already
813                 // blocked and therefore it should not be unblocked on exit.
814                 let _blocked_signal = &(*v.borrow())
815                     .as_ref()
816                     .and_then(|state| state.signal_num)
817                     .map(BlockedSignal::new);
818 
819                 *v.borrow_mut() = None;
820             });
821         }
822 
823         // Prevent `vcpu_run_handle_drop` from being called until we actually set up the signal
824         // blocking. The handle needs to be made now so that we can use the fingerprint.
825         let vcpu_run_handle = ManuallyDrop::new(VcpuRunHandle::new(vcpu_run_handle_drop));
826 
827         // AcqRel ordering is sufficient to ensure only one thread gets to set its fingerprint to
828         // this Vcpu and subsequent `run` calls will see the fingerprint.
829         if self
830             .vcpu_run_handle_fingerprint
831             .compare_exchange(
832                 0,
833                 vcpu_run_handle.fingerprint().as_u64(),
834                 std::sync::atomic::Ordering::AcqRel,
835                 std::sync::atomic::Ordering::Acquire,
836             )
837             .is_err()
838         {
839             return Err(Error::new(EBUSY));
840         }
841 
842         // Block the signal while we add the thread-local state -- if a signal fires (very
843         // unlikely, as this means something is trying to pause the vcpu before it has even
844         // started), it would try to grab the read lock while this write lock is held and
845         // cause a deadlock.
846         // We assume that a failure to block the signal means it is already blocked.
847         let _blocked_signal = signal_num.map(BlockedSignal::new);
848 
849         VCPU_THREAD.with(|v| {
850             if v.borrow().is_none() {
851                 *v.borrow_mut() = Some(VcpuThread {
852                     run: self.run_mmap.as_ptr() as *mut kvm_run,
853                     signal_num,
854                 });
855                 Ok(())
856             } else {
857                 Err(Error::new(EBUSY))
858             }
859         })?;
860 
861         Ok(ManuallyDrop::into_inner(vcpu_run_handle))
862     }
863 
864     fn id(&self) -> usize {
865         self.id
866     }
867 
868     #[allow(clippy::cast_ptr_alignment)]
869     fn set_immediate_exit(&self, exit: bool) {
870         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
871         // kernel told us how large it was. The pointer is page aligned so casting to a different
872         // type is well defined, hence the clippy allow attribute.
873         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
874         run.immediate_exit = exit.into();
875     }
876 
877     fn set_local_immediate_exit(exit: bool) {
878         VCPU_THREAD.with(|v| {
879             if let Some(state) = &(*v.borrow()) {
880                 unsafe {
881                     (*state.run).immediate_exit = exit.into();
882                 };
883             }
884         });
885     }
886 
887     fn set_local_immediate_exit_fn(&self) -> extern "C" fn() {
888         extern "C" fn f() {
889             KvmVcpu::set_local_immediate_exit(true);
890         }
891         f
892     }
893 
894     fn pvclock_ctrl(&self) -> Result<()> {
895         self.pvclock_ctrl_arch()
896     }
897 
898     fn set_signal_mask(&self, signals: &[c_int]) -> Result<()> {
899         let sigset = signal::create_sigset(signals)?;
900 
901         let mut kvm_sigmask = vec_with_array_field::<kvm_signal_mask, sigset_t>(1);
902         // The Rust definition of sigset_t takes 128 bytes, but the kernel only
903         // expects an 8-byte structure, so we can't write
904         // kvm_sigmask.len = size_of::<sigset_t>() as u32;
905         kvm_sigmask[0].len = 8;
906         // Compile-time check that the 8-byte length above does not exceed the size of sigset_t.
907         const _ASSERT: usize = size_of::<sigset_t>() - 8usize;
908 
909         // Safe as we allocated exactly the needed space
910         unsafe {
911             copy_nonoverlapping(
912                 &sigset as *const sigset_t as *const u8,
913                 kvm_sigmask[0].sigset.as_mut_ptr(),
914                 8,
915             );
916         }
917 
918         let ret = unsafe {
919             // The ioctl is safe because the kernel will only read from the
920             // kvm_signal_mask structure.
921             ioctl_with_ref(self, KVM_SET_SIGNAL_MASK(), &kvm_sigmask[0])
922         };
923         if ret == 0 {
924             Ok(())
925         } else {
926             errno_result()
927         }
928     }
929 
930     unsafe fn enable_raw_capability(&self, cap: u32, args: &[u64; 4]) -> Result<()> {
931         let kvm_cap = kvm_enable_cap {
932             cap,
933             args: *args,
934             ..Default::default()
935         };
936         // Safe because we allocated the struct and we know the kernel will read exactly the size of
937         // the struct, and because we assume the caller has allocated the args appropriately.
938         let ret = ioctl_with_ref(self, KVM_ENABLE_CAP(), &kvm_cap);
939         if ret == 0 {
940             Ok(())
941         } else {
942             errno_result()
943         }
944     }
945 
946     #[allow(clippy::cast_ptr_alignment)]
947     // The pointer is page aligned so casting to a different type is well defined, hence the clippy
948     // allow attribute.
949     fn run(&mut self, run_handle: &VcpuRunHandle) -> Result<VcpuExit> {
950         // Acquire ensures this check is ordered after the `compare_exchange` in `take_run_handle`.
951         if self
952             .vcpu_run_handle_fingerprint
953             .load(std::sync::atomic::Ordering::Acquire)
954             != run_handle.fingerprint().as_u64()
955         {
956             panic!("invalid VcpuRunHandle used to run Vcpu");
957         }
958 
959         // Safe because we know that our file is a VCPU fd and we verify the return result.
960         let ret = unsafe { ioctl(self, KVM_RUN()) };
961         if ret != 0 {
962             return errno_result();
963         }
964 
965         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
966         // kernel told us how large it was.
967         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
968         match run.exit_reason {
969             KVM_EXIT_IO => Ok(VcpuExit::Io),
970             KVM_EXIT_MMIO => Ok(VcpuExit::Mmio),
971             KVM_EXIT_IOAPIC_EOI => {
972                 // Safe because the exit_reason (which comes from the kernel) told us which
973                 // union field to use.
974                 let vector = unsafe { run.__bindgen_anon_1.eoi.vector };
975                 Ok(VcpuExit::IoapicEoi { vector })
976             }
977             KVM_EXIT_HYPERV => Ok(VcpuExit::HypervHypercall),
978             KVM_EXIT_UNKNOWN => Ok(VcpuExit::Unknown),
979             KVM_EXIT_EXCEPTION => Ok(VcpuExit::Exception),
980             KVM_EXIT_HYPERCALL => Ok(VcpuExit::Hypercall),
981             KVM_EXIT_DEBUG => Ok(VcpuExit::Debug),
982             KVM_EXIT_HLT => Ok(VcpuExit::Hlt),
983             KVM_EXIT_IRQ_WINDOW_OPEN => Ok(VcpuExit::IrqWindowOpen),
984             KVM_EXIT_SHUTDOWN => Ok(VcpuExit::Shutdown),
985             KVM_EXIT_FAIL_ENTRY => {
986                 // Safe because the exit_reason (which comes from the kernel) told us which
987                 // union field to use.
988                 let hardware_entry_failure_reason = unsafe {
989                     run.__bindgen_anon_1
990                         .fail_entry
991                         .hardware_entry_failure_reason
992                 };
993                 Ok(VcpuExit::FailEntry {
994                     hardware_entry_failure_reason,
995                 })
996             }
997             KVM_EXIT_INTR => Ok(VcpuExit::Intr),
998             KVM_EXIT_SET_TPR => Ok(VcpuExit::SetTpr),
999             KVM_EXIT_TPR_ACCESS => Ok(VcpuExit::TprAccess),
1000             KVM_EXIT_S390_SIEIC => Ok(VcpuExit::S390Sieic),
1001             KVM_EXIT_S390_RESET => Ok(VcpuExit::S390Reset),
1002             KVM_EXIT_DCR => Ok(VcpuExit::Dcr),
1003             KVM_EXIT_NMI => Ok(VcpuExit::Nmi),
1004             KVM_EXIT_INTERNAL_ERROR => Ok(VcpuExit::InternalError),
1005             KVM_EXIT_OSI => Ok(VcpuExit::Osi),
1006             KVM_EXIT_PAPR_HCALL => Ok(VcpuExit::PaprHcall),
1007             KVM_EXIT_S390_UCONTROL => Ok(VcpuExit::S390Ucontrol),
1008             KVM_EXIT_WATCHDOG => Ok(VcpuExit::Watchdog),
1009             KVM_EXIT_S390_TSCH => Ok(VcpuExit::S390Tsch),
1010             KVM_EXIT_EPR => Ok(VcpuExit::Epr),
1011             KVM_EXIT_SYSTEM_EVENT => {
1012                 // Safe because we know the exit reason told us this union
1013                 // field is valid
1014                 let event_type = unsafe { run.__bindgen_anon_1.system_event.type_ };
1015                 let event_flags =
1016                     unsafe { run.__bindgen_anon_1.system_event.__bindgen_anon_1.flags };
1017                 match event_type {
1018                     KVM_SYSTEM_EVENT_SHUTDOWN => Ok(VcpuExit::SystemEventShutdown),
1019                     KVM_SYSTEM_EVENT_RESET => self.system_event_reset(event_flags),
1020                     KVM_SYSTEM_EVENT_CRASH => Ok(VcpuExit::SystemEventCrash),
1021                     KVM_SYSTEM_EVENT_S2IDLE => Ok(VcpuExit::SystemEventS2Idle),
1022                     _ => {
1023                         error!(
1024                             "Unknown KVM system event {} with flags {}",
1025                             event_type, event_flags
1026                         );
1027                         Err(Error::new(EINVAL))
1028                     }
1029                 }
1030             }
1031             KVM_EXIT_X86_RDMSR => {
1032                 // Safe because the exit_reason (which comes from the kernel) told us which
1033                 // union field to use.
1034                 let msr = unsafe { &mut run.__bindgen_anon_1.msr };
1035                 let index = msr.index;
1036                 // Fail the MSR read by default; `handle_rdmsr` clears this if the read is handled.
1037                 msr.error = 1;
1038                 Ok(VcpuExit::RdMsr { index })
1039             }
1040             KVM_EXIT_X86_WRMSR => {
1041                 // Safe because the exit_reason (which comes from the kernel) told us which
1042                 // union field to use.
1043                 let msr = unsafe { &mut run.__bindgen_anon_1.msr };
1044                 // By default fail the MSR write.
1045                 msr.error = 1;
1046                 let index = msr.index;
1047                 let data = msr.data;
1048                 Ok(VcpuExit::WrMsr { index, data })
1049             }
1050             KVM_EXIT_X86_BUS_LOCK => Ok(VcpuExit::BusLock),
1051             r => panic!("unknown kvm exit reason: {}", r),
1052         }
1053     }
1054 
1055     fn handle_mmio(&self, handle_fn: &mut dyn FnMut(IoParams) -> Option<[u8; 8]>) -> Result<()> {
1056         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
1057         // kernel told us how large it was. The pointer is page aligned so casting to a different
1058         // type is well defined, hence the clippy allow attribute.
1059         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
1060         // Verify that the handler is called in the right context.
1061         assert!(run.exit_reason == KVM_EXIT_MMIO);
1062         // Safe because the exit_reason (which comes from the kernel) told us which
1063         // union field to use.
1064         let mmio = unsafe { &mut run.__bindgen_anon_1.mmio };
1065         let address = mmio.phys_addr;
1066         let size = min(mmio.len as usize, mmio.data.len());
1067         if mmio.is_write != 0 {
1068             handle_fn(IoParams {
1069                 address,
1070                 size,
1071                 operation: IoOperation::Write { data: mmio.data },
1072             });
1073             Ok(())
1074         } else if let Some(data) = handle_fn(IoParams {
1075             address,
1076             size,
1077             operation: IoOperation::Read,
1078         }) {
1079             mmio.data[..size].copy_from_slice(&data[..size]);
1080             Ok(())
1081         } else {
1082             Err(Error::new(EINVAL))
1083         }
1084     }
1085 
1086     fn handle_io(&self, handle_fn: &mut dyn FnMut(IoParams) -> Option<[u8; 8]>) -> Result<()> {
1087         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
1088         // kernel told us how large it was. The pointer is page aligned so casting to a different
1089         // type is well defined, hence the clippy allow attribute.
1090         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
1091         // Verify that the handler is called in the right context.
1092         assert!(run.exit_reason == KVM_EXIT_IO);
1093         let run_start = run as *mut kvm_run as *mut u8;
1094         // Safe because the exit_reason (which comes from the kernel) told us which
1095         // union field to use.
1096         let io = unsafe { run.__bindgen_anon_1.io };
1097         let size = (io.count as usize) * (io.size as usize);
1098         match io.direction as u32 {
1099             KVM_EXIT_IO_IN => {
1100                 if let Some(data) = handle_fn(IoParams {
1101                     address: io.port.into(),
1102                     size,
1103                     operation: IoOperation::Read,
1104                 }) {
1105                     // The data_offset is defined by the kernel to be some number of bytes
1106                     // into the kvm_run structure, which we have fully mmap'd.
1107                     unsafe {
1108                         let data_ptr = run_start.offset(io.data_offset as isize);
1109                         copy_nonoverlapping(data.as_ptr(), data_ptr, size);
1110                     }
1111                     Ok(())
1112                 } else {
1113                     Err(Error::new(EINVAL))
1114                 }
1115             }
1116             KVM_EXIT_IO_OUT => {
1117                 let mut data = [0; 8];
1118                 // The data_offset is defined by the kernel to be some number of bytes
1119                 // into the kvm_run structure, which we have fully mmap'd.
1120                 unsafe {
1121                     let data_ptr = run_start.offset(io.data_offset as isize);
1122                     copy_nonoverlapping(data_ptr, data.as_mut_ptr(), min(size, data.len()));
1123                 }
1124                 handle_fn(IoParams {
1125                     address: io.port.into(),
1126                     size,
1127                     operation: IoOperation::Write { data },
1128                 });
1129                 Ok(())
1130             }
1131             _ => Err(Error::new(EINVAL)),
1132         }
1133     }
1134 
1135     fn handle_hyperv_hypercall(
1136         &self,
1137         handle_fn: &mut dyn FnMut(HypervHypercall) -> u64,
1138     ) -> Result<()> {
1139         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
1140         // kernel told us how large it was.
1141         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
1142         // Verify that the handler is called in the right context.
1143         assert!(run.exit_reason == KVM_EXIT_HYPERV);
1144         // Safe because the exit_reason (which comes from the kernel) told us which
1145         // union field to use.
1146         let hyperv = unsafe { &mut run.__bindgen_anon_1.hyperv };
1147         match hyperv.type_ as u32 {
1148             KVM_EXIT_HYPERV_SYNIC => {
1149                 let synic = unsafe { &hyperv.u.synic };
1150                 handle_fn(HypervHypercall::HypervSynic {
1151                     msr: synic.msr,
1152                     control: synic.control,
1153                     evt_page: synic.evt_page,
1154                     msg_page: synic.msg_page,
1155                 });
1156                 Ok(())
1157             }
1158             KVM_EXIT_HYPERV_HCALL => {
1159                 let hcall = unsafe { &mut hyperv.u.hcall };
1160                 hcall.result = handle_fn(HypervHypercall::HypervHcall {
1161                     input: hcall.input,
1162                     params: hcall.params,
1163                 });
1164                 Ok(())
1165             }
1166             _ => Err(Error::new(EINVAL)),
1167         }
1168     }
1169 
1170     fn handle_rdmsr(&self, data: u64) -> Result<()> {
1171         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
1172         // kernel told us how large it was.
1173         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
1174         // Verify that the handler is called in the right context.
1175         assert!(run.exit_reason == KVM_EXIT_X86_RDMSR);
1176         // Safe because the exit_reason (which comes from the kernel) told us which
1177         // union field to use.
1178         let msr = unsafe { &mut run.__bindgen_anon_1.msr };
1179         msr.data = data;
1180         msr.error = 0;
1181         Ok(())
1182     }
1183 
1184     fn handle_wrmsr(&self) {
1185         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
1186         // kernel told us how large it was.
1187         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
1188         // Verify that the handler is called in the right context.
1189         assert!(run.exit_reason == KVM_EXIT_X86_WRMSR);
1190         // Safe because the exit_reason (which comes from the kernel) told us which
1191         // union field to use.
1192         let msr = unsafe { &mut run.__bindgen_anon_1.msr };
1193         msr.error = 0;
1194     }
1195 }
1196 
1197 impl KvmVcpu {
1198     /// Gets the vcpu's current "multiprocessing state".
1199     ///
1200     /// See the documentation for KVM_GET_MP_STATE. This call can only succeed after
1201     /// a call to `Vm::create_irq_chip`.
1202     ///
1203     /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
1204     /// to run crosvm on s390.
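    ///
    /// A small sketch (illustrative; assumes `create_irq_chip` was already called on the VM):
    ///
    /// ```ignore
    /// let mut state = vcpu.get_mp_state()?;
    /// state.mp_state = kvm_sys::KVM_MP_STATE_RUNNABLE;
    /// vcpu.set_mp_state(&state)?;
    /// ```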
1205     pub fn get_mp_state(&self) -> Result<kvm_mp_state> {
1206         // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
1207         // correct amount of memory to our pointer, and we verify the return result.
1208         let mut state: kvm_mp_state = unsafe { std::mem::zeroed() };
1209         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_MP_STATE(), &mut state) };
1210         if ret < 0 {
1211             return errno_result();
1212         }
1213         Ok(state)
1214     }
1215 
1216     /// Sets the vcpu's current "multiprocessing state".
1217     ///
1218     /// See the documentation for KVM_SET_MP_STATE. This call can only succeed after
1219     /// a call to `Vm::create_irq_chip`.
1220     ///
1221     /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
1222     /// to run crosvm on s390.
1223     pub fn set_mp_state(&self, state: &kvm_mp_state) -> Result<()> {
1224         let ret = unsafe {
1225             // The ioctl is safe because the kernel will only read from the kvm_mp_state struct.
1226             ioctl_with_ref(self, KVM_SET_MP_STATE(), state)
1227         };
1228         if ret < 0 {
1229             return errno_result();
1230         }
1231         Ok(())
1232     }
1233 }
1234 
1235 impl AsRawDescriptor for KvmVcpu {
1236     fn as_raw_descriptor(&self) -> RawDescriptor {
1237         self.vcpu.as_raw_descriptor()
1238     }
1239 }
1240 
1241 impl TryFrom<HypervisorCap> for KvmCap {
1242     type Error = Error;
1243 
1244     fn try_from(cap: HypervisorCap) -> Result<KvmCap> {
1245         match cap {
1246             HypervisorCap::ArmPmuV3 => Ok(KvmCap::ArmPmuV3),
1247             HypervisorCap::ImmediateExit => Ok(KvmCap::ImmediateExit),
1248             HypervisorCap::S390UserSigp => Ok(KvmCap::S390UserSigp),
1249             HypervisorCap::TscDeadlineTimer => Ok(KvmCap::TscDeadlineTimer),
1250             HypervisorCap::UserMemory => Ok(KvmCap::UserMemory),
1251             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1252             HypervisorCap::Xcrs => Ok(KvmCap::Xcrs),
1253             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1254             HypervisorCap::CalibratedTscLeafRequired => Err(Error::new(libc::EINVAL)),
1255             HypervisorCap::StaticSwiotlbAllocationRequired => Err(Error::new(libc::EINVAL)),
1256             HypervisorCap::HypervisorInitializedBootContext => Err(Error::new(libc::EINVAL)),
1257         }
1258     }
1259 }
1260 
1261 impl From<&IrqRoute> for kvm_irq_routing_entry {
1262     fn from(item: &IrqRoute) -> Self {
1263         match &item.source {
1264             IrqSource::Irqchip { chip, pin } => kvm_irq_routing_entry {
1265                 gsi: item.gsi,
1266                 type_: KVM_IRQ_ROUTING_IRQCHIP,
1267                 u: kvm_irq_routing_entry__bindgen_ty_1 {
1268                     irqchip: kvm_irq_routing_irqchip {
1269                         irqchip: chip_to_kvm_chip(*chip),
1270                         pin: *pin,
1271                     },
1272                 },
1273                 ..Default::default()
1274             },
1275             IrqSource::Msi { address, data } => kvm_irq_routing_entry {
1276                 gsi: item.gsi,
1277                 type_: KVM_IRQ_ROUTING_MSI,
1278                 u: kvm_irq_routing_entry__bindgen_ty_1 {
1279                     msi: kvm_irq_routing_msi {
1280                         address_lo: *address as u32,
1281                         address_hi: (*address >> 32) as u32,
1282                         data: *data,
1283                         ..Default::default()
1284                     },
1285                 },
1286                 ..Default::default()
1287             },
1288         }
1289     }
1290 }
1291 
1292 impl From<&kvm_mp_state> for MPState {
1293     fn from(item: &kvm_mp_state) -> Self {
1294         match item.mp_state {
1295             KVM_MP_STATE_RUNNABLE => MPState::Runnable,
1296             KVM_MP_STATE_UNINITIALIZED => MPState::Uninitialized,
1297             KVM_MP_STATE_INIT_RECEIVED => MPState::InitReceived,
1298             KVM_MP_STATE_HALTED => MPState::Halted,
1299             KVM_MP_STATE_SIPI_RECEIVED => MPState::SipiReceived,
1300             KVM_MP_STATE_STOPPED => MPState::Stopped,
1301             state => {
1302                 error!(
1303                     "unrecognized kvm_mp_state {}, setting to KVM_MP_STATE_RUNNABLE",
1304                     state
1305                 );
1306                 MPState::Runnable
1307             }
1308         }
1309     }
1310 }
1311 
1312 impl From<&MPState> for kvm_mp_state {
1313     fn from(item: &MPState) -> Self {
1314         kvm_mp_state {
1315             mp_state: match item {
1316                 MPState::Runnable => KVM_MP_STATE_RUNNABLE,
1317                 MPState::Uninitialized => KVM_MP_STATE_UNINITIALIZED,
1318                 MPState::InitReceived => KVM_MP_STATE_INIT_RECEIVED,
1319                 MPState::Halted => KVM_MP_STATE_HALTED,
1320                 MPState::SipiReceived => KVM_MP_STATE_SIPI_RECEIVED,
1321                 MPState::Stopped => KVM_MP_STATE_STOPPED,
1322             },
1323         }
1324     }
1325 }
1326