// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! A safe wrapper around the kernel's KVM interface.

mod cap;

use std::cell::RefCell;
use std::cmp::{min, Ordering};
use std::collections::{BTreeMap, BinaryHeap};
use std::ffi::CString;
use std::fs::File;
use std::mem::size_of;
use std::ops::{Deref, DerefMut};
use std::os::raw::*;
use std::os::unix::prelude::OsStrExt;
use std::path::{Path, PathBuf};
use std::ptr::copy_nonoverlapping;
use std::sync::Arc;
use sync::Mutex;

use base::{AsRawDescriptor, FromRawDescriptor, RawDescriptor};
use data_model::vec_with_array_field;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use data_model::FlexibleArrayWrapper;

use libc::sigset_t;
use libc::{open, EBUSY, EINVAL, ENOENT, ENOSPC, EOVERFLOW, O_CLOEXEC, O_RDWR};

use kvm_sys::*;

#[allow(unused_imports)]
use base::{
    block_signal, ioctl, ioctl_with_mut_ptr, ioctl_with_mut_ref, ioctl_with_ptr, ioctl_with_ref,
    ioctl_with_val, pagesize, signal, unblock_signal, warn, Error, Event, IoctlNr, MappedRegion,
    MemoryMapping, MemoryMappingBuilder, MmapError, Result, SIGRTMIN,
};
use vm_memory::{GuestAddress, GuestMemory};

pub use crate::cap::*;

fn errno_result<T>() -> Result<T> {
    Err(Error::last())
}

unsafe fn set_user_memory_region<F: AsRawDescriptor>(
    fd: &F,
    slot: u32,
    read_only: bool,
    log_dirty_pages: bool,
    guest_addr: u64,
    memory_size: u64,
    userspace_addr: *mut u8,
) -> Result<()> {
    let mut flags = if read_only { KVM_MEM_READONLY } else { 0 };
    if log_dirty_pages {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    let region = kvm_userspace_memory_region {
        slot,
        flags,
        guest_phys_addr: guest_addr,
        memory_size,
        userspace_addr: userspace_addr as u64,
    };

    let ret = ioctl_with_ref(fd, KVM_SET_USER_MEMORY_REGION(), &region);
    if ret == 0 {
        Ok(())
    } else {
        errno_result()
    }
}

/// Helper function to determine the size in bytes of a dirty log bitmap for the given memory region
/// size.
///
/// # Arguments
///
/// * `size` - Number of bytes in the memory region being queried.
pub fn dirty_log_bitmap_size(size: usize) -> usize {
    let page_size = pagesize();
    (((size + page_size - 1) / page_size) + 7) / 8
}

/// A wrapper around opening and using `/dev/kvm`.
///
/// Useful for querying extensions and basic values from the KVM backend. A `Kvm` is required to
/// create a `Vm` object.
pub struct Kvm {
    kvm: File,
}

impl Kvm {
    /// Opens `/dev/kvm` and returns a Kvm object on success.
    pub fn new() -> Result<Kvm> {
        Kvm::new_with_path(&PathBuf::from("/dev/kvm"))
    }

    /// Opens a KVM device at `device_path` and returns a Kvm object on success.
    pub fn new_with_path(device_path: &Path) -> Result<Kvm> {
        // Open calls are safe because we give a nul-terminated string and verify the result.
        let c_path = CString::new(device_path.as_os_str().as_bytes()).unwrap();
        let ret = unsafe { open(c_path.as_ptr(), O_RDWR | O_CLOEXEC) };
        if ret < 0 {
            return errno_result();
        }
        // Safe because we verify that ret is valid and we own the fd.
        Ok(Kvm {
            kvm: unsafe { File::from_raw_descriptor(ret) },
        })
    }

    fn check_extension_int(&self, c: Cap) -> i32 {
        // Safe because we know that our file is a KVM fd and that the extension is one of the ones
        // defined by kernel.
        unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), c as c_ulong) }
    }

    /// Checks if a particular `Cap` is available.
    pub fn check_extension(&self, c: Cap) -> bool {
        self.check_extension_int(c) == 1
    }

    /// Gets the size of the mmap required to use vcpu's `kvm_run` structure.
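    ///
    /// # Example
    ///
    /// A minimal sketch (assumes this crate is built as `kvm` and that `/dev/kvm`
    /// is usable at run time):
    ///
    /// ```no_run
    /// # use kvm::Kvm;
    /// let kvm = Kvm::new().expect("failed to open /dev/kvm");
    /// let size = kvm.get_vcpu_mmap_size().expect("failed to query mmap size");
    /// // The mapping must at least cover the kernel's kvm_run structure.
    /// assert!(size > 0);
    /// ```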
    pub fn get_vcpu_mmap_size(&self) -> Result<usize> {
        // Safe because we know that our file is a KVM fd and we verify the return result.
        let res = unsafe { ioctl(self, KVM_GET_VCPU_MMAP_SIZE()) };
        if res > 0 {
            Ok(res as usize)
        } else {
            errno_result()
        }
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_cpuid(&self, kind: IoctlNr) -> Result<CpuId> {
        const MAX_KVM_CPUID_ENTRIES: usize = 256;
        let mut cpuid = CpuId::new(MAX_KVM_CPUID_ENTRIES);

        let ret = unsafe {
            // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
            // allocated for the struct. The limit is read from nent, which is set to the allocated
            // size (MAX_KVM_CPUID_ENTRIES) above.
            ioctl_with_mut_ptr(self, kind, cpuid.as_mut_ptr())
        };
        if ret < 0 {
            return errno_result();
        }

        Ok(cpuid)
    }

    /// X86 specific call to get the system supported CPUID values
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_supported_cpuid(&self) -> Result<CpuId> {
        self.get_cpuid(KVM_GET_SUPPORTED_CPUID())
    }

    /// X86 specific call to get the system emulated CPUID values
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_emulated_cpuid(&self) -> Result<CpuId> {
        self.get_cpuid(KVM_GET_EMULATED_CPUID())
    }

    /// X86 specific call to get list of supported MSRS
    ///
    /// See the documentation for KVM_GET_MSR_INDEX_LIST.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_msr_index_list(&self) -> Result<Vec<u32>> {
        const MAX_KVM_MSR_ENTRIES: usize = 256;

        let mut msr_list = vec_with_array_field::<kvm_msr_list, u32>(MAX_KVM_MSR_ENTRIES);
        msr_list[0].nmsrs = MAX_KVM_MSR_ENTRIES as u32;

        let ret = unsafe {
            // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
            // allocated for the struct. The limit is read from nmsrs, which is set to the allocated
            // size (MAX_KVM_MSR_ENTRIES) above.
            ioctl_with_mut_ref(self, KVM_GET_MSR_INDEX_LIST(), &mut msr_list[0])
        };
        if ret < 0 {
            return errno_result();
        }

        let mut nmsrs = msr_list[0].nmsrs;

        // Mapping the unsized array to a slice is unsafe because the length isn't known. Using
        // the length we originally allocated with eliminates the possibility of overflow.
        let indices: &[u32] = unsafe {
            if nmsrs > MAX_KVM_MSR_ENTRIES as u32 {
                nmsrs = MAX_KVM_MSR_ENTRIES as u32;
            }
            msr_list[0].indices.as_slice(nmsrs as usize)
        };

        Ok(indices.to_vec())
    }
}

impl AsRawDescriptor for Kvm {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.kvm.as_raw_descriptor()
    }
}

/// An address either in programmable I/O space or in memory mapped I/O space.
#[derive(Copy, Clone, Debug)]
pub enum IoeventAddress {
    Pio(u64),
    Mmio(u64),
}

/// Used in `Vm::register_ioevent` to indicate a size and optionally value to match.
pub enum Datamatch {
    AnyLength,
    U8(Option<u8>),
    U16(Option<u16>),
    U32(Option<u32>),
    U64(Option<u64>),
}

/// A source of IRQs in an `IrqRoute`.
pub enum IrqSource {
    Irqchip { chip: u32, pin: u32 },
    Msi { address: u64, data: u32 },
}

/// A single route for an IRQ.
pub struct IrqRoute {
    pub gsi: u32,
    pub source: IrqSource,
}

/// Interrupt controller IDs
pub enum PicId {
    Primary = 0,
    Secondary = 1,
}

/// Number of pins on the IOAPIC.
pub const NUM_IOAPIC_PINS: usize = 24;

// Used to invert the order when stored in a max-heap.
#[derive(Copy, Clone, Eq, PartialEq)]
struct MemSlot(u32);

impl Ord for MemSlot {
    fn cmp(&self, other: &MemSlot) -> Ordering {
        // Notice the order is inverted so the lowest magnitude slot has the highest priority in a
        // max-heap.
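        // For example, if the heap holds MemSlot(3), MemSlot(1), and MemSlot(2),
        // `BinaryHeap::pop` yields MemSlot(1) first, so the lowest freed slot is
        // always the one reused next.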
        other.0.cmp(&self.0)
    }
}

impl PartialOrd for MemSlot {
    fn partial_cmp(&self, other: &MemSlot) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// A wrapper around creating and using a VM.
pub struct Vm {
    vm: File,
    guest_mem: GuestMemory,
    mem_regions: Arc<Mutex<BTreeMap<u32, Box<dyn MappedRegion>>>>,
    mem_slot_gaps: Arc<Mutex<BinaryHeap<MemSlot>>>,
}

impl Vm {
    /// Constructs a new `Vm` using the given `Kvm` instance.
    pub fn new(kvm: &Kvm, guest_mem: GuestMemory) -> Result<Vm> {
        // Safe because we know kvm is a real kvm fd as this module is the only one that can make
        // Kvm objects.
        let ret = unsafe { ioctl(kvm, KVM_CREATE_VM()) };
        if ret >= 0 {
            // Safe because we verify the value of ret and we are the owners of the fd.
            let vm_file = unsafe { File::from_raw_descriptor(ret) };
            guest_mem.with_regions(|index, guest_addr, size, host_addr, _, _| {
                unsafe {
                    // Safe because the guest regions are guaranteed not to overlap.
                    set_user_memory_region(
                        &vm_file,
                        index as u32,
                        false,
                        false,
                        guest_addr.offset() as u64,
                        size as u64,
                        host_addr as *mut u8,
                    )
                }
            })?;

            Ok(Vm {
                vm: vm_file,
                guest_mem,
                mem_regions: Arc::new(Mutex::new(BTreeMap::new())),
                mem_slot_gaps: Arc::new(Mutex::new(BinaryHeap::new())),
            })
        } else {
            errno_result()
        }
    }

    /// Checks if a particular `Cap` is available.
    ///
    /// This is distinct from the `Kvm` version of this method because some extensions depend on
    /// the existence of the particular `Vm`. This method is encouraged by the kernel because it
    /// more accurately reflects the usable capabilities.
    pub fn check_extension(&self, c: Cap) -> bool {
        // Safe because we know that our file is a KVM fd and that the extension is one of the ones
        // defined by kernel.
        unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), c as c_ulong) == 1 }
    }

    /// Inserts the given `mem` into the VM's address space at `guest_addr`.
    ///
    /// The slot that was assigned the kvm memory mapping is returned on success. The slot can be
    /// given to `Vm::remove_memory_region` to remove the memory from the VM's address space and
    /// take back ownership of `mem`.
    ///
    /// Note that memory inserted into the VM's address space must not overlap with any other memory
    /// slot's region.
    ///
    /// If `read_only` is true, the guest will be able to read the memory as normal, but attempts to
    /// write will trigger a mmio VM exit, leaving the memory untouched.
    ///
    /// If `log_dirty_pages` is true, the slot number can be used to retrieve the pages written to
    /// by the guest with `get_dirty_log`.
    pub fn add_memory_region(
        &mut self,
        guest_addr: GuestAddress,
        mem: Box<dyn MappedRegion>,
        read_only: bool,
        log_dirty_pages: bool,
    ) -> Result<u32> {
        let size = mem.size() as u64;
        let end_addr = guest_addr
            .checked_add(size)
            .ok_or_else(|| Error::new(EOVERFLOW))?;
        if self.guest_mem.range_overlap(guest_addr, end_addr) {
            return Err(Error::new(ENOSPC));
        }
        let mut regions = self.mem_regions.lock();
        let mut gaps = self.mem_slot_gaps.lock();
        let slot = match gaps.pop() {
            Some(gap) => gap.0,
            None => (regions.len() + self.guest_mem.num_regions() as usize) as u32,
        };

        // Safe because we check that the given guest address is valid and has no overlaps. We also
        // know that the pointer and size are correct because the MemoryMapping interface ensures
        // this. We take ownership of the memory mapping so that it won't be unmapped until the slot
        // is removed.
        let res = unsafe {
            set_user_memory_region(
                &self.vm,
                slot,
                read_only,
                log_dirty_pages,
                guest_addr.offset() as u64,
                size,
                mem.as_ptr(),
            )
        };

        if let Err(e) = res {
            gaps.push(MemSlot(slot));
            return Err(e);
        }
        regions.insert(slot, mem);

        Ok(slot)
    }

    /// Removes memory that was previously added at the given slot.
    ///
    /// Ownership of the host memory mapping associated with the given slot is returned on success.
    pub fn remove_memory_region(&mut self, slot: u32) -> Result<Box<dyn MappedRegion>> {
        let mut regions = self.mem_regions.lock();
        if !regions.contains_key(&slot) {
            return Err(Error::new(ENOENT));
        }
        // Safe because the slot is checked against the list of memory slots.
        unsafe {
            set_user_memory_region(&self.vm, slot, false, false, 0, 0, std::ptr::null_mut())?;
        }
        self.mem_slot_gaps.lock().push(MemSlot(slot));
        // This remove will always succeed because of the contains_key check above.
        Ok(regions.remove(&slot).unwrap())
    }

    /// Gets the bitmap of dirty pages since the last call to `get_dirty_log` for the memory at
    /// `slot`.
    ///
    /// The size of `dirty_log` must be at least as many bits as there are pages in the memory
    /// region `slot` represents. For example, if the size of `slot` is 16 pages, `dirty_log` must
    /// be 2 bytes or greater.
    pub fn get_dirty_log(&self, slot: u32, dirty_log: &mut [u8]) -> Result<()> {
        match self.mem_regions.lock().get(&slot) {
            Some(mem) => {
                // Ensures that there are as many bytes in dirty_log as there are pages in the mmap.
                if dirty_log_bitmap_size(mem.size()) > dirty_log.len() {
                    return Err(Error::new(EINVAL));
                }
                let mut dirty_log_kvm = kvm_dirty_log {
                    slot,
                    ..Default::default()
                };
                dirty_log_kvm.__bindgen_anon_1.dirty_bitmap = dirty_log.as_ptr() as *mut c_void;
                // Safe because the `dirty_bitmap` pointer assigned above is guaranteed to be valid
                // (because it's from a slice) and we checked that it will be large enough to hold
                // the entire log.
                let ret = unsafe { ioctl_with_ref(self, KVM_GET_DIRTY_LOG(), &dirty_log_kvm) };
                if ret == 0 {
                    Ok(())
                } else {
                    errno_result()
                }
            }
            _ => Err(Error::new(ENOENT)),
        }
    }

    /// Gets a reference to the guest memory owned by this VM.
    ///
    /// Note that `GuestMemory` does not include any mmio memory that may have been added after
    /// this VM was constructed.
    pub fn get_memory(&self) -> &GuestMemory {
        &self.guest_mem
    }

    /// Sets the address of a one-page region in the VM's address space.
    ///
    /// See the documentation on the KVM_SET_IDENTITY_MAP_ADDR ioctl.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()> {
        // Safe because we know that our file is a VM fd and we verify the return result.
        let ret =
            unsafe { ioctl_with_ref(self, KVM_SET_IDENTITY_MAP_ADDR(), &(addr.offset() as u64)) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the current timestamp of kvmclock as seen by the current guest.
    ///
    /// See the documentation on the KVM_GET_CLOCK ioctl.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_clock(&self) -> Result<kvm_clock_data> {
        // Safe because we know that our file is a VM fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut clock_data = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_CLOCK(), &mut clock_data) };
        if ret == 0 {
            Ok(clock_data)
        } else {
            errno_result()
        }
    }

    /// Sets the current timestamp of kvmclock to the specified value.
    ///
    /// See the documentation on the KVM_SET_CLOCK ioctl.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_clock(&self, clock_data: &kvm_clock_data) -> Result<()> {
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
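        // A typical use (sketch, not required by this API): save the value from
        // `get_clock` before pausing the guest and restore it here on resume, so
        // the guest's kvmclock does not jump across the pause.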
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_CLOCK(), clock_data) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Creates an in-kernel interrupt controller.
    ///
    /// See the documentation on the KVM_CREATE_IRQCHIP ioctl.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn create_irq_chip(&self) -> Result<()> {
        // Safe because we know that our file is a VM fd and we verify the return result.
        let ret = unsafe { ioctl(self, KVM_CREATE_IRQCHIP()) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the state of given interrupt controller by issuing KVM_GET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_pic_state(&self, id: PicId) -> Result<kvm_pic_state> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: id as u32,
            ..Default::default()
        };
        let ret = unsafe {
            // Safe because we know our file is a VM fd, we know the kernel will only write the
            // correct amount of memory to our pointer, and we verify the return result.
            ioctl_with_mut_ref(self, KVM_GET_IRQCHIP(), &mut irqchip_state)
        };
        if ret == 0 {
            Ok(unsafe {
                // Safe as we know that we are retrieving data related to the
                // PIC (primary or secondary) and not IOAPIC.
                irqchip_state.chip.pic
            })
        } else {
            errno_result()
        }
    }

    /// Sets the state of given interrupt controller by issuing KVM_SET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_pic_state(&self, id: PicId, state: &kvm_pic_state) -> Result<()> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: id as u32,
            ..Default::default()
        };
        irqchip_state.chip.pic = *state;
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_IRQCHIP(), &irqchip_state) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the state of IOAPIC by issuing KVM_GET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_ioapic_state(&self) -> Result<kvm_ioapic_state> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: 2,
            ..Default::default()
        };
        let ret = unsafe {
            // Safe because we know our file is a VM fd, we know the kernel will only write the
            // correct amount of memory to our pointer, and we verify the return result.
            ioctl_with_mut_ref(self, KVM_GET_IRQCHIP(), &mut irqchip_state)
        };
        if ret == 0 {
            Ok(unsafe {
                // Safe as we know that we are retrieving data related to the
                // IOAPIC and not PIC.
                irqchip_state.chip.ioapic
            })
        } else {
            errno_result()
        }
    }

    /// Sets the state of IOAPIC by issuing KVM_SET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_ioapic_state(&self, state: &kvm_ioapic_state) -> Result<()> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: 2,
            ..Default::default()
        };
        irqchip_state.chip.ioapic = *state;
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_IRQCHIP(), &irqchip_state) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Sets the level on the given irq to 1 if `active` is true, and 0 otherwise.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn set_irq_line(&self, irq: u32, active: bool) -> Result<()> {
        let mut irq_level = kvm_irq_level::default();
        irq_level.__bindgen_anon_1.irq = irq;
        irq_level.level = if active { 1 } else { 0 };

        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IRQ_LINE(), &irq_level) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Creates a PIT as per the KVM_CREATE_PIT2 ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn create_pit(&self) -> Result<()> {
        let pit_config = kvm_pit_config::default();
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_CREATE_PIT2(), &pit_config) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the state of PIT by issuing KVM_GET_PIT2 ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_pit`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_pit_state(&self) -> Result<kvm_pit_state2> {
        // Safe because we know that our file is a VM fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut pit_state = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_PIT2(), &mut pit_state) };
        if ret == 0 {
            Ok(pit_state)
        } else {
            errno_result()
        }
    }

    /// Sets the state of PIT by issuing KVM_SET_PIT2 ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_pit`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_pit_state(&self, pit_state: &kvm_pit_state2) -> Result<()> {
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_PIT2(), pit_state) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    /// The `datamatch` parameter can be used to limit signaling `evt` to only the cases where the
    /// value being written is equal to `datamatch`. Note that the size of `datamatch` is important
    /// and must match the expected size of the guest's write.
    ///
    /// In all cases where `evt` is signaled, the ordinary vmexit to userspace that would be
    /// triggered is prevented.
    pub fn register_ioevent(
        &self,
        evt: &Event,
        addr: IoeventAddress,
        datamatch: Datamatch,
    ) -> Result<()> {
        self.ioeventfd(evt, addr, datamatch, false)
    }

    /// Unregisters an event previously registered with `register_ioevent`.
    ///
    /// The `evt`, `addr`, and `datamatch` set must be the same as the ones passed into
    /// `register_ioevent`.
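    ///
    /// # Example
    ///
    /// A sketch of a register/unregister round trip; it assumes a usable
    /// `/dev/kvm` and uses only types from this crate and its dependencies:
    ///
    /// ```no_run
    /// # use kvm::{Datamatch, IoeventAddress, Kvm, Vm};
    /// # use base::Event;
    /// # use vm_memory::{GuestAddress, GuestMemory};
    /// let kvm = Kvm::new().unwrap();
    /// let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
    /// let vm = Vm::new(&kvm, gm).unwrap();
    /// let evt = Event::new().unwrap();
    /// vm.register_ioevent(&evt, IoeventAddress::Pio(0xf4), Datamatch::AnyLength)
    ///     .unwrap();
    /// // The same (evt, addr, datamatch) triple undoes the registration.
    /// vm.unregister_ioevent(&evt, IoeventAddress::Pio(0xf4), Datamatch::AnyLength)
    ///     .unwrap();
    /// ```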
    pub fn unregister_ioevent(
        &self,
        evt: &Event,
        addr: IoeventAddress,
        datamatch: Datamatch,
    ) -> Result<()> {
        self.ioeventfd(evt, addr, datamatch, true)
    }

    fn ioeventfd(
        &self,
        evt: &Event,
        addr: IoeventAddress,
        datamatch: Datamatch,
        deassign: bool,
    ) -> Result<()> {
        let (do_datamatch, datamatch_value, datamatch_len) = match datamatch {
            Datamatch::AnyLength => (false, 0, 0),
            Datamatch::U8(v) => match v {
                Some(u) => (true, u as u64, 1),
                None => (false, 0, 1),
            },
            Datamatch::U16(v) => match v {
                Some(u) => (true, u as u64, 2),
                None => (false, 0, 2),
            },
            Datamatch::U32(v) => match v {
                Some(u) => (true, u as u64, 4),
                None => (false, 0, 4),
            },
            Datamatch::U64(v) => match v {
                Some(u) => (true, u as u64, 8),
                None => (false, 0, 8),
            },
        };
        let mut flags = 0;
        if deassign {
            flags |= 1 << kvm_ioeventfd_flag_nr_deassign;
        }
        if do_datamatch {
            flags |= 1 << kvm_ioeventfd_flag_nr_datamatch
        }
        if let IoeventAddress::Pio(_) = addr {
            flags |= 1 << kvm_ioeventfd_flag_nr_pio;
        }
        let ioeventfd = kvm_ioeventfd {
            datamatch: datamatch_value,
            len: datamatch_len,
            addr: match addr {
                IoeventAddress::Pio(p) => p as u64,
                IoeventAddress::Mmio(m) => m,
            },
            fd: evt.as_raw_descriptor(),
            flags,
            ..Default::default()
        };
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IOEVENTFD(), &ioeventfd) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Registers an event that will, when signalled, trigger the `gsi` irq, and `resample_evt`
    /// will get triggered when the irqchip is resampled.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn register_irqfd_resample(
        &self,
        evt: &Event,
        resample_evt: &Event,
        gsi: u32,
    ) -> Result<()> {
        let irqfd = kvm_irqfd {
            flags: KVM_IRQFD_FLAG_RESAMPLE,
            fd: evt.as_raw_descriptor() as u32,
            resamplefd: resample_evt.as_raw_descriptor() as u32,
            gsi,
            ..Default::default()
        };
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &irqfd) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Unregisters an event that was previously registered with
    /// `register_irqfd`/`register_irqfd_resample`.
    ///
    /// The `evt` and `gsi` pair must be the same as the ones passed into
    /// `register_irqfd`/`register_irqfd_resample`.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn unregister_irqfd(&self, evt: &Event, gsi: u32) -> Result<()> {
        let irqfd = kvm_irqfd {
            fd: evt.as_raw_descriptor() as u32,
            gsi,
            flags: KVM_IRQFD_FLAG_DEASSIGN,
            ..Default::default()
        };
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &irqfd) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Sets the GSI routing table, replacing any table set with previous calls to
    /// `set_gsi_routing`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_gsi_routing(&self, routes: &[IrqRoute]) -> Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(routes.len());
        irq_routing[0].nr = routes.len() as u32;

        // Safe because we ensured there is enough space in irq_routing to hold the number of
        // route entries.
        let irq_routes = unsafe { irq_routing[0].entries.as_mut_slice(routes.len()) };
        for (route, irq_route) in routes.iter().zip(irq_routes.iter_mut()) {
            irq_route.gsi = route.gsi;
            match route.source {
                IrqSource::Irqchip { chip, pin } => {
                    irq_route.type_ = KVM_IRQ_ROUTING_IRQCHIP;
                    irq_route.u.irqchip = kvm_irq_routing_irqchip { irqchip: chip, pin }
                }
                IrqSource::Msi { address, data } => {
                    irq_route.type_ = KVM_IRQ_ROUTING_MSI;
                    irq_route.u.msi = kvm_irq_routing_msi {
                        address_lo: address as u32,
                        address_hi: (address >> 32) as u32,
                        data,
                        ..Default::default()
                    }
                }
            }
        }

        let ret = unsafe { ioctl_with_ref(self, KVM_SET_GSI_ROUTING(), &irq_routing[0]) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Enable the specified capability.
    /// See documentation for KVM_ENABLE_CAP.
    /// This function is marked as unsafe because `cap` may contain values which are interpreted as
    /// pointers by the kernel.
    pub unsafe fn kvm_enable_cap(&self, cap: &kvm_enable_cap) -> Result<()> {
        // Safe because we allocated the struct and we know the kernel will read exactly the size of
        // the struct.
        let ret = ioctl_with_ref(self, KVM_ENABLE_CAP(), cap);
        if ret < 0 {
            errno_result()
        } else {
            Ok(())
        }
    }
}

impl AsRawDescriptor for Vm {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.vm.as_raw_descriptor()
    }
}

/// A reason why a VCPU exited. One of these returns every time `Vcpu::run` is called.
#[derive(Debug)]
pub enum VcpuExit {
    /// An out port instruction was run on the given port with the given data.
    IoOut {
        port: u16,
        size: usize,
        data: [u8; 8],
    },
    /// An in port instruction was run on the given port.
    ///
    /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
    /// called again.
    IoIn {
        port: u16,
        size: usize,
    },
    /// A read instruction was run against the given MMIO address.
    ///
    /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
    /// called again.
    MmioRead {
        address: u64,
        size: usize,
    },
    /// A write instruction was run against the given MMIO address with the given data.
    MmioWrite {
        address: u64,
        size: usize,
        data: [u8; 8],
    },
    IoapicEoi {
        vector: u8,
    },
    HypervSynic {
        msr: u32,
        control: u64,
        evt_page: u64,
        msg_page: u64,
    },
    HypervHcall {
        input: u64,
        params: [u64; 2],
    },
    Unknown,
    Exception,
    Hypercall,
    Debug,
    Hlt,
    IrqWindowOpen,
    Shutdown,
    FailEntry {
        hardware_entry_failure_reason: u64,
    },
    Intr,
    SetTpr,
    TprAccess,
    S390Sieic,
    S390Reset,
    Dcr,
    Nmi,
    InternalError,
    Osi,
    PaprHcall,
    S390Ucontrol,
    Watchdog,
    S390Tsch,
    Epr,
    /// The cpu triggered a system level event which is specified by the type field.
    /// The first field is the event type and the second field is flags.
    /// The possible event types are shutdown, reset, or crash. So far there
    /// are not any flags defined.
    SystemEvent(u32 /* event_type */, u64 /* flags */),
}

/// A wrapper around creating and using a VCPU.
/// `Vcpu` provides all functionality except for running. To run, `to_runnable` must be called to
/// lock the vcpu to a thread. Then the returned `RunnableVcpu` can be used for running.
pub struct Vcpu {
    vcpu: File,
    run_mmap: MemoryMapping,
}

pub struct VcpuThread {
    run: *mut kvm_run,
    signal_num: Option<c_int>,
}

thread_local!(static VCPU_THREAD: RefCell<Option<VcpuThread>> = RefCell::new(None));

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// The `id` argument is the CPU number between [0, max vcpus).
    pub fn new(id: c_ulong, kvm: &Kvm, vm: &Vm) -> Result<Vcpu> {
        let run_mmap_size = kvm.get_vcpu_mmap_size()?;

        // Safe because we know that vm is a VM fd and we verify the return result.
        let vcpu_fd = unsafe { ioctl_with_val(vm, KVM_CREATE_VCPU(), id) };
        if vcpu_fd < 0 {
            return errno_result();
        }

        // Wrap the vcpu now in case the following ? returns early. This is safe because we verified
        // the value of the fd and we own the fd.
        let vcpu = unsafe { File::from_raw_descriptor(vcpu_fd) };

        let run_mmap = MemoryMappingBuilder::new(run_mmap_size)
            .from_file(&vcpu)
            .build()
            .map_err(|_| Error::new(ENOSPC))?;

        Ok(Vcpu { vcpu, run_mmap })
    }

    /// Consumes `self` and returns a `RunnableVcpu`. A `RunnableVcpu` is required to run the
    /// guest.
    /// Assigns a vcpu to the current thread and stores it in a thread-local that can be used by
    /// signal handlers to call set_local_immediate_exit(). An optional signal number will be
    /// temporarily blocked while assigning the vcpu to the thread and later blocked when
    /// `RunnableVcpu` is destroyed.
    ///
    /// Returns an error, `EBUSY`, if the current thread already contains a Vcpu.
    #[allow(clippy::cast_ptr_alignment)]
    pub fn to_runnable(self, signal_num: Option<c_int>) -> Result<RunnableVcpu> {
        // Block signal while we add -- if a signal fires (very unlikely,
        // as this means something is trying to pause the vcpu before it has
        // even started) it'll try to grab the read lock while this write
        // lock is grabbed and cause a deadlock.
        // Assuming that a failure to block means it's already blocked.
        let _blocked_signal = signal_num.map(BlockedSignal::new);

        VCPU_THREAD.with(|v| {
            if v.borrow().is_none() {
                *v.borrow_mut() = Some(VcpuThread {
                    run: self.run_mmap.as_ptr() as *mut kvm_run,
                    signal_num,
                });
                Ok(())
            } else {
                Err(Error::new(EBUSY))
            }
        })?;

        Ok(RunnableVcpu {
            vcpu: self,
            phantom: Default::default(),
        })
    }

    /// Sets the data received by a mmio read, ioport in, or hypercall instruction.
    ///
    /// This function should be called after `Vcpu::run` returns a `VcpuExit::IoIn`,
    /// `VcpuExit::MmioRead`, or `VcpuExit::HypervHcall`.
    #[allow(clippy::cast_ptr_alignment)]
    pub fn set_data(&self, data: &[u8]) -> Result<()> {
        // Safe because we know we mapped enough memory to hold the kvm_run struct because the
        // kernel told us how large it was. The pointer is page aligned so casting to a different
        // type is well defined, hence the clippy allow attribute.
        let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
        match run.exit_reason {
            KVM_EXIT_IO => {
                let run_start = run as *mut kvm_run as *mut u8;
                // Safe because the exit_reason (which comes from the kernel) told us which
                // union field to use.
                let io = unsafe { run.__bindgen_anon_1.io };
                if io.direction as u32 != KVM_EXIT_IO_IN {
                    return Err(Error::new(EINVAL));
                }
                let data_size = (io.count as usize) * (io.size as usize);
                if data_size != data.len() {
                    return Err(Error::new(EINVAL));
                }
                // The data_offset is defined by the kernel to be some number of bytes into the
                // kvm_run structure, which we have fully mmap'd.
                unsafe {
                    let data_ptr = run_start.offset(io.data_offset as isize);
                    copy_nonoverlapping(data.as_ptr(), data_ptr, data_size);
                }
                Ok(())
            }
            KVM_EXIT_MMIO => {
                // Safe because the exit_reason (which comes from the kernel) told us which
                // union field to use.
                let mmio = unsafe { &mut run.__bindgen_anon_1.mmio };
                if mmio.is_write != 0 {
                    return Err(Error::new(EINVAL));
                }
                let len = mmio.len as usize;
                if len != data.len() {
                    return Err(Error::new(EINVAL));
                }
                mmio.data[..len].copy_from_slice(data);
                Ok(())
            }
            KVM_EXIT_HYPERV => {
                // Safe because the exit_reason (which comes from the kernel) told us which
                // union field to use.
                let hyperv = unsafe { &mut run.__bindgen_anon_1.hyperv };
                if hyperv.type_ != KVM_EXIT_HYPERV_HCALL {
                    return Err(Error::new(EINVAL));
                }
                let hcall = unsafe { &mut hyperv.u.hcall };
                if data.len() != std::mem::size_of::<u64>() {
                    return Err(Error::new(EINVAL));
                }
                // Convert the data to a u64 and store it as the hypercall result. (Copying into
                // the temporary array returned by `to_ne_bytes` would be silently discarded.)
                let mut result = [0u8; 8];
                result.copy_from_slice(data);
                hcall.result = u64::from_ne_bytes(result);
                Ok(())
            }
            _ => Err(Error::new(EINVAL)),
        }
    }

    /// Sets the bit that requests an immediate exit.
    #[allow(clippy::cast_ptr_alignment)]
    pub fn set_immediate_exit(&self, exit: bool) {
        // Safe because we know we mapped enough memory to hold the kvm_run struct because the
        // kernel told us how large it was. The pointer is page aligned so casting to a different
        // type is well defined, hence the clippy allow attribute.
        let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
        run.immediate_exit = if exit { 1 } else { 0 };
    }

    /// Sets/clears the bit for immediate exit for the vcpu on the current thread.
    pub fn set_local_immediate_exit(exit: bool) {
        VCPU_THREAD.with(|v| {
            if let Some(state) = &(*v.borrow()) {
                unsafe {
                    (*state.run).immediate_exit = if exit { 1 } else { 0 };
                };
            }
        });
    }

    /// Gets the VCPU registers.
    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    pub fn get_regs(&self) -> Result<kvm_regs> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_REGS(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// Sets the VCPU registers.
    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    pub fn set_regs(&self, regs: &kvm_regs) -> Result<()> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_REGS(), regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the VCPU special registers.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_sregs(&self) -> Result<kvm_sregs> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_SREGS(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// Sets the VCPU special registers.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<()> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_SREGS(), sregs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the VCPU FPU registers.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_fpu(&self) -> Result<kvm_fpu> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_FPU(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// X86 specific call to setup the FPU
    ///
    /// See the documentation for KVM_SET_FPU.
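    ///
    /// # Example
    ///
    /// A sketch of a read-modify-write of the FPU state (assumes a usable
    /// `/dev/kvm`; `0x37f` is the x87 default control word):
    ///
    /// ```no_run
    /// # use kvm::{Kvm, Vcpu, Vm};
    /// # use vm_memory::{GuestAddress, GuestMemory};
    /// let kvm = Kvm::new().unwrap();
    /// let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
    /// let vm = Vm::new(&kvm, gm).unwrap();
    /// let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
    /// let mut fpu = vcpu.get_fpu().unwrap();
    /// fpu.fcw = 0x37f;
    /// vcpu.set_fpu(&fpu).unwrap();
    /// ```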
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn set_fpu(&self, fpu: &kvm_fpu) -> Result<()> { let ret = unsafe { // Here we trust the kernel not to read past the end of the kvm_fpu struct. ioctl_with_ref(self, KVM_SET_FPU(), fpu) }; if ret < 0 { return errno_result(); } Ok(()) } /// Gets the VCPU debug registers. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn get_debugregs(&self) -> Result { // Safe because we know that our file is a VCPU fd, we know the kernel will only write the // correct amount of memory to our pointer, and we verify the return result. let mut regs = unsafe { std::mem::zeroed() }; let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_DEBUGREGS(), &mut regs) }; if ret != 0 { return errno_result(); } Ok(regs) } /// Sets the VCPU debug registers #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn set_debugregs(&self, dregs: &kvm_debugregs) -> Result<()> { let ret = unsafe { // Here we trust the kernel not to read past the end of the kvm_fpu struct. ioctl_with_ref(self, KVM_SET_DEBUGREGS(), dregs) }; if ret < 0 { return errno_result(); } Ok(()) } /// Gets the VCPU extended control registers #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn get_xcrs(&self) -> Result { // Safe because we know that our file is a VCPU fd, we know the kernel will only write the // correct amount of memory to our pointer, and we verify the return result. let mut regs = unsafe { std::mem::zeroed() }; let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_XCRS(), &mut regs) }; if ret != 0 { return errno_result(); } Ok(regs) } /// Sets the VCPU extended control registers #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<()> { let ret = unsafe { // Here we trust the kernel not to read past the end of the kvm_xcrs struct. ioctl_with_ref(self, KVM_SET_XCRS(), xcrs) }; if ret < 0 { return errno_result(); } Ok(()) } /// X86 specific call to get the MSRS /// /// See the documentation for KVM_SET_MSRS. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn get_msrs(&self, msr_entries: &mut Vec) -> Result<()> { let mut msrs = vec_with_array_field::(msr_entries.len()); unsafe { // Mapping the unsized array to a slice is unsafe because the length isn't known. // Providing the length used to create the struct guarantees the entire slice is valid. let entries: &mut [kvm_msr_entry] = msrs[0].entries.as_mut_slice(msr_entries.len()); entries.copy_from_slice(&msr_entries); } msrs[0].nmsrs = msr_entries.len() as u32; let ret = unsafe { // Here we trust the kernel not to read or write past the end of the kvm_msrs struct. ioctl_with_ref(self, KVM_GET_MSRS(), &msrs[0]) }; if ret < 0 { // KVM_SET_MSRS actually returns the number of msr entries written. return errno_result(); } unsafe { let count = ret as usize; assert!(count <= msr_entries.len()); let entries: &mut [kvm_msr_entry] = msrs[0].entries.as_mut_slice(count); msr_entries.truncate(count); msr_entries.copy_from_slice(&entries); } Ok(()) } /// X86 specific call to setup the MSRS /// /// See the documentation for KVM_SET_MSRS. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn set_msrs(&self, msrs: &kvm_msrs) -> Result<()> { let ret = unsafe { // Here we trust the kernel not to read past the end of the kvm_msrs struct. ioctl_with_ref(self, KVM_SET_MSRS(), msrs) }; if ret < 0 { // KVM_SET_MSRS actually returns the number of msr entries written. 
            return errno_result();
        }
        Ok(())
    }

    /// X86 specific call to setup the CPUID registers
    ///
    /// See the documentation for KVM_SET_CPUID2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_cpuid2(&self, cpuid: &CpuId) -> Result<()> {
        let ret = unsafe {
            // Here we trust the kernel not to read past the end of the kvm_cpuid2 struct.
            ioctl_with_ptr(self, KVM_SET_CPUID2(), cpuid.as_ptr())
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// X86 specific call to get the system emulated hyper-v CPUID values
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_hyperv_cpuid(&self) -> Result<CpuId> {
        const MAX_KVM_CPUID_ENTRIES: usize = 256;
        let mut cpuid = CpuId::new(MAX_KVM_CPUID_ENTRIES);

        let ret = unsafe {
            // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
            // allocated for the struct. The limit is read from nent, which is set to the allocated
            // size (MAX_KVM_CPUID_ENTRIES) above.
            ioctl_with_mut_ptr(self, KVM_GET_SUPPORTED_HV_CPUID(), cpuid.as_mut_ptr())
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(cpuid)
    }

    /// X86 specific call to get the state of the "Local Advanced Programmable Interrupt
    /// Controller".
    ///
    /// See the documentation for KVM_GET_LAPIC.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_lapic(&self) -> Result<kvm_lapic_state> {
        let mut klapic: kvm_lapic_state = Default::default();

        let ret = unsafe {
            // The ioctl is unsafe unless you trust the kernel not to write past the end of the
            // local_apic struct.
            ioctl_with_mut_ref(self, KVM_GET_LAPIC(), &mut klapic)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(klapic)
    }

    /// X86 specific call to set the state of the "Local Advanced Programmable Interrupt
    /// Controller".
    ///
    /// See the documentation for KVM_SET_LAPIC.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_lapic(&self, klapic: &kvm_lapic_state) -> Result<()> {
        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the klapic struct.
            ioctl_with_ref(self, KVM_SET_LAPIC(), klapic)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the vcpu's current "multiprocessing state".
    ///
    /// See the documentation for KVM_GET_MP_STATE. This call can only succeed after
    /// a call to `Vm::create_irq_chip`.
    ///
    /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
    /// to run crosvm on s390.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_mp_state(&self) -> Result<kvm_mp_state> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only
        // write the correct amount of memory to our pointer, and we verify the return result.
        let mut state: kvm_mp_state = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_MP_STATE(), &mut state) };
        if ret < 0 {
            return errno_result();
        }
        Ok(state)
    }

    /// Sets the vcpu's current "multiprocessing state".
    ///
    /// See the documentation for KVM_SET_MP_STATE. This call can only succeed after
    /// a call to `Vm::create_irq_chip`.
    ///
    /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
    /// to run crosvm on s390.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_mp_state(&self, state: &kvm_mp_state) -> Result<()> {
        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the kvm_mp_state struct.
            ioctl_with_ref(self, KVM_SET_MP_STATE(), state)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the vcpu's currently pending exceptions, interrupts, NMIs, etc
    ///
    /// See the documentation for KVM_GET_VCPU_EVENTS.
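    ///
    /// # Example
    ///
    /// A sketch of a save/restore round trip of pending events (x86 only,
    /// assumes a usable `/dev/kvm`):
    ///
    /// ```no_run
    /// # use kvm::{Kvm, Vcpu, Vm};
    /// # use vm_memory::{GuestAddress, GuestMemory};
    /// let kvm = Kvm::new().unwrap();
    /// let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
    /// let vm = Vm::new(&kvm, gm).unwrap();
    /// let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
    /// let events = vcpu.get_vcpu_events().unwrap();
    /// vcpu.set_vcpu_events(&events).unwrap();
    /// ```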
    ///
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_vcpu_events(&self) -> Result<kvm_vcpu_events> {
        // Safe because we know that our file is a VCPU fd, we know the kernel
        // will only write the correct amount of memory to our pointer, and we
        // verify the return result.
        let mut events: kvm_vcpu_events = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_VCPU_EVENTS(), &mut events) };
        if ret < 0 {
            return errno_result();
        }
        Ok(events)
    }

    /// Sets the vcpu's currently pending exceptions, interrupts, NMIs, etc
    ///
    /// See the documentation for KVM_SET_VCPU_EVENTS.
    ///
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_vcpu_events(&self, events: &kvm_vcpu_events) -> Result<()> {
        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the
            // kvm_vcpu_events.
            ioctl_with_ref(self, KVM_SET_VCPU_EVENTS(), events)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Enable the specified capability.
    /// See documentation for KVM_ENABLE_CAP.
    /// This function is marked as unsafe because `cap` may contain values which are interpreted as
    /// pointers by the kernel.
    pub unsafe fn kvm_enable_cap(&self, cap: &kvm_enable_cap) -> Result<()> {
        // Safe because we allocated the struct and we know the kernel will read exactly the size of
        // the struct.
        let ret = ioctl_with_ref(self, KVM_ENABLE_CAP(), cap);
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Specifies set of signals that are blocked during execution of KVM_RUN.
    /// Signals that are not blocked will cause KVM_RUN to return with -EINTR.
    ///
    /// See the documentation for KVM_SET_SIGNAL_MASK
    pub fn set_signal_mask(&self, signals: &[c_int]) -> Result<()> {
        let sigset = signal::create_sigset(signals)?;

        let mut kvm_sigmask = vec_with_array_field::<kvm_signal_mask, sigset_t>(1);
        // Rust definition of sigset_t takes 128 bytes, but the kernel only
        // expects 8-bytes structure, so we can't write
        // kvm_sigmask.len = size_of::<sigset_t>() as u32;
        kvm_sigmask[0].len = 8;
        // Ensure the length is not too big.
        const _ASSERT: usize = size_of::<sigset_t>() - 8usize;

        // Safe as we allocated exactly the needed space
        unsafe {
            copy_nonoverlapping(
                &sigset as *const sigset_t as *const u8,
                kvm_sigmask[0].sigset.as_mut_ptr(),
                8,
            );
        }

        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the
            // kvm_signal_mask structure.
            ioctl_with_ref(self, KVM_SET_SIGNAL_MASK(), &kvm_sigmask[0])
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Sets the value of one register on this VCPU. The id of the register is
    /// encoded as specified in the kernel documentation for KVM_SET_ONE_REG.
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    pub fn set_one_reg(&self, reg_id: u64, data: u64) -> Result<()> {
        let data_ref = &data as *const u64;
        let onereg = kvm_one_reg {
            id: reg_id,
            addr: data_ref as u64,
        };
        // safe because we allocated the struct and we know the kernel will read
        // exactly the size of the struct
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_ONE_REG(), &onereg) };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }
}

impl AsRawDescriptor for Vcpu {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.vcpu.as_raw_descriptor()
    }
}

/// A Vcpu that has a thread and can be run. Created by calling `to_runnable` on a `Vcpu`.
/// Implements `Deref` to a `Vcpu` so all `Vcpu` methods are usable, with the addition of the `run`
/// function to execute the guest.
pub struct RunnableVcpu {
    vcpu: Vcpu,
    // vcpus must stay on the same thread once they start.
    // Add the PhantomData pointer to ensure RunnableVcpu is not `Send`.
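    // (`*mut u8` implements neither `Send` nor `Sync`, so a zero-sized
    // `PhantomData<*mut u8>` field strips both auto traits from this struct
    // without storing anything at run time.)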
    phantom: std::marker::PhantomData<*mut u8>,
}

impl RunnableVcpu {
    /// Runs the VCPU until it exits, returning the reason for the exit.
    ///
    /// Note that the state of the VCPU and associated VM must be setup first for this to do
    /// anything useful.
    #[allow(clippy::cast_ptr_alignment)]
    // The pointer is page aligned so casting to a different type is well defined, hence the clippy
    // allow attribute.
    pub fn run(&self) -> Result<VcpuExit> {
        // Safe because we know that our file is a VCPU fd and we verify the return result.
        let ret = unsafe { ioctl(self, KVM_RUN()) };
        if ret == 0 {
            // Safe because we know we mapped enough memory to hold the kvm_run struct because the
            // kernel told us how large it was.
            let run = unsafe { &*(self.run_mmap.as_ptr() as *const kvm_run) };
            match run.exit_reason {
                KVM_EXIT_IO => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let io = unsafe { run.__bindgen_anon_1.io };
                    let port = io.port;
                    let size = (io.count as usize) * (io.size as usize);
                    match io.direction as u32 {
                        KVM_EXIT_IO_IN => Ok(VcpuExit::IoIn { port, size }),
                        KVM_EXIT_IO_OUT => {
                            let mut data = [0; 8];
                            let run_start = run as *const kvm_run as *const u8;
                            // The data_offset is defined by the kernel to be some number of bytes
                            // into the kvm_run structure, which we have fully mmap'd.
                            unsafe {
                                let data_ptr = run_start.offset(io.data_offset as isize);
                                copy_nonoverlapping(
                                    data_ptr,
                                    data.as_mut_ptr(),
                                    min(size, data.len()),
                                );
                            }
                            Ok(VcpuExit::IoOut { port, size, data })
                        }
                        _ => Err(Error::new(EINVAL)),
                    }
                }
                KVM_EXIT_MMIO => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let mmio = unsafe { &run.__bindgen_anon_1.mmio };
                    let address = mmio.phys_addr;
                    let size = min(mmio.len as usize, mmio.data.len());
                    if mmio.is_write != 0 {
                        Ok(VcpuExit::MmioWrite {
                            address,
                            size,
                            data: mmio.data,
                        })
                    } else {
                        Ok(VcpuExit::MmioRead { address, size })
                    }
                }
                KVM_EXIT_IOAPIC_EOI => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let vector = unsafe { run.__bindgen_anon_1.eoi.vector };
                    Ok(VcpuExit::IoapicEoi { vector })
                }
                KVM_EXIT_HYPERV => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let hyperv = unsafe { &run.__bindgen_anon_1.hyperv };
                    match hyperv.type_ as u32 {
                        KVM_EXIT_HYPERV_SYNIC => {
                            let synic = unsafe { &hyperv.u.synic };
                            Ok(VcpuExit::HypervSynic {
                                msr: synic.msr,
                                control: synic.control,
                                evt_page: synic.evt_page,
                                msg_page: synic.msg_page,
                            })
                        }
                        KVM_EXIT_HYPERV_HCALL => {
                            let hcall = unsafe { &hyperv.u.hcall };
                            Ok(VcpuExit::HypervHcall {
                                input: hcall.input,
                                params: hcall.params,
                            })
                        }
                        _ => Err(Error::new(EINVAL)),
                    }
                }
                KVM_EXIT_UNKNOWN => Ok(VcpuExit::Unknown),
                KVM_EXIT_EXCEPTION => Ok(VcpuExit::Exception),
                KVM_EXIT_HYPERCALL => Ok(VcpuExit::Hypercall),
                KVM_EXIT_DEBUG => Ok(VcpuExit::Debug),
                KVM_EXIT_HLT => Ok(VcpuExit::Hlt),
                KVM_EXIT_IRQ_WINDOW_OPEN => Ok(VcpuExit::IrqWindowOpen),
                KVM_EXIT_SHUTDOWN => Ok(VcpuExit::Shutdown),
                KVM_EXIT_FAIL_ENTRY => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let hardware_entry_failure_reason = unsafe {
                        run.__bindgen_anon_1
                            .fail_entry
                            .hardware_entry_failure_reason
                    };
                    Ok(VcpuExit::FailEntry {
                        hardware_entry_failure_reason,
                    })
                }
                KVM_EXIT_INTR => Ok(VcpuExit::Intr),
                KVM_EXIT_SET_TPR => Ok(VcpuExit::SetTpr),
                KVM_EXIT_TPR_ACCESS => Ok(VcpuExit::TprAccess),
                KVM_EXIT_S390_SIEIC => Ok(VcpuExit::S390Sieic),
                KVM_EXIT_S390_RESET => Ok(VcpuExit::S390Reset),
                KVM_EXIT_DCR => Ok(VcpuExit::Dcr),
                KVM_EXIT_NMI => Ok(VcpuExit::Nmi),
                KVM_EXIT_INTERNAL_ERROR => Ok(VcpuExit::InternalError),
                KVM_EXIT_OSI => Ok(VcpuExit::Osi),
                KVM_EXIT_PAPR_HCALL => Ok(VcpuExit::PaprHcall),
                KVM_EXIT_S390_UCONTROL => Ok(VcpuExit::S390Ucontrol),
                KVM_EXIT_WATCHDOG => Ok(VcpuExit::Watchdog),
                KVM_EXIT_S390_TSCH => Ok(VcpuExit::S390Tsch),
                KVM_EXIT_EPR => Ok(VcpuExit::Epr),
                KVM_EXIT_SYSTEM_EVENT => {
                    // Safe because we know the exit reason told us this union
                    // field is valid
                    let event_type = unsafe { run.__bindgen_anon_1.system_event.type_ };
                    let event_flags = unsafe { run.__bindgen_anon_1.system_event.flags };
                    Ok(VcpuExit::SystemEvent(event_type, event_flags))
                }
                r => panic!("unknown kvm exit reason: {}", r),
            }
        } else {
            errno_result()
        }
    }
}

impl Deref for RunnableVcpu {
    type Target = Vcpu;
    fn deref(&self) -> &Self::Target {
        &self.vcpu
    }
}

impl DerefMut for RunnableVcpu {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.vcpu
    }
}

impl AsRawDescriptor for RunnableVcpu {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.vcpu.as_raw_descriptor()
    }
}

impl Drop for RunnableVcpu {
    fn drop(&mut self) {
        VCPU_THREAD.with(|v| {
            // This assumes that a failure in `BlockedSignal::new` means the signal is already
            // blocked and therefore it should not be unblocked on exit.
            let _blocked_signal = &(*v.borrow())
                .as_ref()
                .and_then(|state| state.signal_num)
                .map(BlockedSignal::new);

            *v.borrow_mut() = None;
        });
    }
}

/// Wrapper for kvm_cpuid2 which has a zero length array at the end.
/// Hides the zero length array behind a bounds check.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub type CpuId = FlexibleArrayWrapper<kvm_cpuid2, kvm_cpuid_entry2>;

// Represents a temporarily blocked signal. It will unblock the signal when dropped.
struct BlockedSignal {
    signal_num: c_int,
}

impl BlockedSignal {
    // Returns a `BlockedSignal` if the specified signal can be blocked, otherwise None.
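    // Usage sketch: `let _guard = BlockedSignal::new(signum);` keeps `signum`
    // blocked until `_guard` is dropped at the end of the enclosing scope.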
    fn new(signal_num: c_int) -> Option<BlockedSignal> {
        if block_signal(signal_num).is_ok() {
            Some(BlockedSignal { signal_num })
        } else {
            None
        }
    }
}

impl Drop for BlockedSignal {
    fn drop(&mut self) {
        let _ = unblock_signal(self.signal_num).expect("failed to restore signal mask");
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn dirty_log_size() {
        let page_size = pagesize();
        assert_eq!(dirty_log_bitmap_size(0), 0);
        assert_eq!(dirty_log_bitmap_size(page_size), 1);
        assert_eq!(dirty_log_bitmap_size(page_size * 8), 1);
        assert_eq!(dirty_log_bitmap_size(page_size * 8 + 1), 2);
        assert_eq!(dirty_log_bitmap_size(page_size * 100), 13);
    }

    #[test]
    fn new() {
        Kvm::new().unwrap();
    }

    #[test]
    fn create_vm() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        Vm::new(&kvm, gm).unwrap();
    }

    #[test]
    fn check_extension() {
        let kvm = Kvm::new().unwrap();
        assert!(kvm.check_extension(Cap::UserMemory));
        // I assume nobody is testing this on s390
        assert!(!kvm.check_extension(Cap::S390UserSigp));
    }

    #[test]
    fn check_vm_extension() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        assert!(vm.check_extension(Cap::UserMemory));
        // I assume nobody is testing this on s390
        assert!(!vm.check_extension(Cap::S390UserSigp));
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_supported_cpuid() {
        let kvm = Kvm::new().unwrap();
        let mut cpuid = kvm.get_supported_cpuid().unwrap();
        let cpuid_entries = cpuid.mut_entries_slice();
        assert!(cpuid_entries.len() > 0);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_emulated_cpuid() {
        let kvm = Kvm::new().unwrap();
        kvm.get_emulated_cpuid().unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_msr_index_list() {
        let kvm = Kvm::new().unwrap();
        let msr_list = kvm.get_msr_index_list().unwrap();
        assert!(msr_list.len() >= 2);
    }

    #[test]
    fn add_memory() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![
            (GuestAddress(0), 0x1000),
            (GuestAddress(0x5000), 0x5000),
        ])
        .unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        let mem_size = 0x1000;
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        vm.add_memory_region(GuestAddress(0x1000), Box::new(mem), false, false)
            .unwrap();
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        vm.add_memory_region(GuestAddress(0x10000), Box::new(mem), false, false)
            .unwrap();
    }

    #[test]
    fn add_memory_ro() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        let mem_size = 0x1000;
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        vm.add_memory_region(GuestAddress(0x1000), Box::new(mem), true, false)
            .unwrap();
    }

    #[test]
    fn remove_memory_region() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        let mem_size = 0x1000;
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        let mem_ptr = mem.as_ptr();
        let slot = vm
            .add_memory_region(GuestAddress(0x1000), Box::new(mem), false, false)
            .unwrap();
        let removed_mem = vm.remove_memory_region(slot).unwrap();
        assert_eq!(removed_mem.size(), mem_size);
        assert_eq!(removed_mem.as_ptr(), mem_ptr);
    }

    #[test]
    fn remove_invalid_memory() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
assert!(vm.remove_memory_region(0).is_err()); } #[test] fn overlap_memory() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let mut vm = Vm::new(&kvm, gm).unwrap(); let mem_size = 0x2000; let mem = MemoryMappingBuilder::new(mem_size).build().unwrap(); assert!(vm .add_memory_region(GuestAddress(0x2000), Box::new(mem), false, false) .is_err()); } #[test] fn get_memory() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); let obj_addr = GuestAddress(0xf0); vm.get_memory().write_obj_at_addr(67u8, obj_addr).unwrap(); let read_val: u8 = vm.get_memory().read_obj_from_addr(obj_addr).unwrap(); assert_eq!(read_val, 67u8); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn clock_handling() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); let mut clock_data = vm.get_clock().unwrap(); clock_data.clock += 1000; vm.set_clock(&clock_data).unwrap(); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn pic_handling() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); vm.create_irq_chip().unwrap(); let pic_state = vm.get_pic_state(PicId::Secondary).unwrap(); vm.set_pic_state(PicId::Secondary, &pic_state).unwrap(); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn ioapic_handling() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); vm.create_irq_chip().unwrap(); let ioapic_state = vm.get_ioapic_state().unwrap(); vm.set_ioapic_state(&ioapic_state).unwrap(); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn pit_handling() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); vm.create_irq_chip().unwrap(); vm.create_pit().unwrap(); let pit_state = vm.get_pit_state().unwrap(); vm.set_pit_state(&pit_state).unwrap(); } #[test] fn register_ioevent() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); let evtfd = Event::new().unwrap(); vm.register_ioevent(&evtfd, IoeventAddress::Pio(0xf4), Datamatch::AnyLength) .unwrap(); vm.register_ioevent(&evtfd, IoeventAddress::Mmio(0x1000), Datamatch::AnyLength) .unwrap(); vm.register_ioevent( &evtfd, IoeventAddress::Pio(0xc1), Datamatch::U8(Some(0x7fu8)), ) .unwrap(); vm.register_ioevent( &evtfd, IoeventAddress::Pio(0xc2), Datamatch::U16(Some(0x1337u16)), ) .unwrap(); vm.register_ioevent( &evtfd, IoeventAddress::Pio(0xc4), Datamatch::U32(Some(0xdeadbeefu32)), ) .unwrap(); vm.register_ioevent( &evtfd, IoeventAddress::Pio(0xc8), Datamatch::U64(Some(0xdeadbeefdeadbeefu64)), ) .unwrap(); } #[test] fn unregister_ioevent() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); let evtfd = Event::new().unwrap(); vm.register_ioevent(&evtfd, IoeventAddress::Pio(0xf4), Datamatch::AnyLength) .unwrap(); vm.register_ioevent(&evtfd, IoeventAddress::Mmio(0x1000), Datamatch::AnyLength) .unwrap(); vm.register_ioevent( &evtfd, IoeventAddress::Mmio(0x1004), Datamatch::U8(Some(0x7fu8)), ) .unwrap(); vm.unregister_ioevent(&evtfd, IoeventAddress::Pio(0xf4), 
            Datamatch::AnyLength)
            .unwrap();
        vm.unregister_ioevent(&evtfd, IoeventAddress::Mmio(0x1000), Datamatch::AnyLength)
            .unwrap();
        vm.unregister_ioevent(
            &evtfd,
            IoeventAddress::Mmio(0x1004),
            Datamatch::U8(Some(0x7fu8)),
        )
        .unwrap();
    }

    #[test]
    fn irqfd_resample() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let evtfd1 = Event::new().unwrap();
        let evtfd2 = Event::new().unwrap();
        vm.create_irq_chip().unwrap();
        vm.register_irqfd_resample(&evtfd1, &evtfd2, 4).unwrap();
        vm.unregister_irqfd(&evtfd1, 4).unwrap();
        // Ensures the ioctl is actually reading the resamplefd.
        vm.register_irqfd_resample(&evtfd1, unsafe { &Event::from_raw_descriptor(-1) }, 4)
            .unwrap_err();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn set_gsi_routing() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.create_irq_chip().unwrap();
        vm.set_gsi_routing(&[]).unwrap();
        vm.set_gsi_routing(&[IrqRoute {
            gsi: 1,
            source: IrqSource::Irqchip {
                chip: KVM_IRQCHIP_IOAPIC,
                pin: 3,
            },
        }])
        .unwrap();
        vm.set_gsi_routing(&[IrqRoute {
            gsi: 1,
            source: IrqSource::Msi {
                address: 0xf000000,
                data: 0xa0,
            },
        }])
        .unwrap();
        vm.set_gsi_routing(&[
            IrqRoute {
                gsi: 1,
                source: IrqSource::Irqchip {
                    chip: KVM_IRQCHIP_IOAPIC,
                    pin: 3,
                },
            },
            IrqRoute {
                gsi: 2,
                source: IrqSource::Msi {
                    address: 0xf000000,
                    data: 0xa0,
                },
            },
        ])
        .unwrap();
    }

    #[test]
    fn create_vcpu() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        Vcpu::new(0, &kvm, &vm).unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn debugregs() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let mut dregs = vcpu.get_debugregs().unwrap();
        dregs.dr7 = 13;
        vcpu.set_debugregs(&dregs).unwrap();
        let dregs2 = vcpu.get_debugregs().unwrap();
        assert_eq!(dregs.dr7, dregs2.dr7);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn xcrs() {
        let kvm = Kvm::new().unwrap();
        if !kvm.check_extension(Cap::Xcrs) {
            return;
        }
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let mut xcrs = vcpu.get_xcrs().unwrap();
        xcrs.xcrs[0].value = 1;
        vcpu.set_xcrs(&xcrs).unwrap();
        let xcrs2 = vcpu.get_xcrs().unwrap();
        assert_eq!(xcrs.xcrs[0].value, xcrs2.xcrs[0].value);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_msrs() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let mut msrs = vec![
            // This one should succeed
            kvm_msr_entry {
                index: 0x0000011e,
                ..Default::default()
            },
            // This one will fail to fetch
            kvm_msr_entry {
                index: 0x000003f1,
                ..Default::default()
            },
        ];
        vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(msrs.len(), 1);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_hyperv_cpuid() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let cpuid = vcpu.get_hyperv_cpuid();
        // Older kernels don't support this ioctl, so tolerate that kind of failure.
match cpuid { Ok(_) => {} Err(e) => { assert_eq!(e.errno(), EINVAL); } } } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn enable_feature() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); vm.create_irq_chip().unwrap(); let vcpu = Vcpu::new(0, &kvm, &vm).unwrap(); let mut cap: kvm_enable_cap = Default::default(); cap.cap = kvm_sys::KVM_CAP_HYPERV_SYNIC; unsafe { vcpu.kvm_enable_cap(&cap) }.unwrap(); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn mp_state() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); vm.create_irq_chip().unwrap(); let vcpu = Vcpu::new(0, &kvm, &vm).unwrap(); let state = vcpu.get_mp_state().unwrap(); vcpu.set_mp_state(&state).unwrap(); } #[test] fn set_signal_mask() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); let vcpu = Vcpu::new(0, &kvm, &vm).unwrap(); vcpu.set_signal_mask(&[base::SIGRTMIN() + 0]).unwrap(); } #[test] fn vcpu_mmap_size() { let kvm = Kvm::new().unwrap(); let mmap_size = kvm.get_vcpu_mmap_size().unwrap(); let page_size = pagesize(); assert!(mmap_size >= page_size); assert!(mmap_size % page_size == 0); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn set_identity_map_addr() { let kvm = Kvm::new().unwrap(); let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap(); let vm = Vm::new(&kvm, gm).unwrap(); vm.set_identity_map_addr(GuestAddress(0x20000)).unwrap(); } }