• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::arch::x86_64::CpuidResult;
6 #[cfg(any(unix, feature = "haxm", feature = "whpx"))]
7 use std::arch::x86_64::__cpuid;
8 use std::arch::x86_64::_rdtsc;
9 use std::collections::BTreeMap;
10 use std::collections::HashSet;
11 
12 use anyhow::Context;
13 use base::custom_serde::deserialize_seq_to_arr;
14 use base::custom_serde::serialize_arr;
15 use base::error;
16 use base::warn;
17 use base::Result;
18 use bit_field::*;
19 use downcast_rs::impl_downcast;
20 use libc::c_void;
21 use serde::Deserialize;
22 use serde::Serialize;
23 use snapshot::AnySnapshot;
24 use vm_memory::GuestAddress;
25 
26 use crate::Hypervisor;
27 use crate::IrqRoute;
28 use crate::IrqSource;
29 use crate::IrqSourceChip;
30 use crate::Vcpu;
31 use crate::Vm;
32 
// AMD Family 15h performance-monitoring MSRs. Event-select (CTLn) and counter
// (CTRn) registers interleave: CTLn = 0xc0010200 + 2*n, CTRn = 0xc0010201 + 2*n.
// These are allow-listed in `VcpuX86_64::restore` because some host kernels
// report them via get_all_msrs but reject writing them back.
const MSR_F15H_PERF_CTL0: u32 = 0xc0010200;
const MSR_F15H_PERF_CTL1: u32 = 0xc0010202;
const MSR_F15H_PERF_CTL2: u32 = 0xc0010204;
const MSR_F15H_PERF_CTL3: u32 = 0xc0010206;
const MSR_F15H_PERF_CTL4: u32 = 0xc0010208;
const MSR_F15H_PERF_CTL5: u32 = 0xc001020a;
const MSR_F15H_PERF_CTR0: u32 = 0xc0010201;
const MSR_F15H_PERF_CTR1: u32 = 0xc0010203;
const MSR_F15H_PERF_CTR2: u32 = 0xc0010205;
const MSR_F15H_PERF_CTR3: u32 = 0xc0010207;
const MSR_F15H_PERF_CTR4: u32 = 0xc0010209;
const MSR_F15H_PERF_CTR5: u32 = 0xc001020b;
// Intel IA32_PERF_CAPABILITIES; also allow-listed for restore (see above).
const MSR_IA32_PERF_CAPABILITIES: u32 = 0x00000345;
46 
/// A trait for managing cpuids for an x86_64 hypervisor and for checking its capabilities.
pub trait HypervisorX86_64: Hypervisor {
    /// Get the system supported CPUID values.
    fn get_supported_cpuid(&self) -> Result<CpuId>;

    /// Gets the list of supported MSRs (MSR indices readable/writable through the
    /// hypervisor's MSR interface).
    fn get_msr_index_list(&self) -> Result<Vec<u32>>;
}
55 
/// A wrapper for using a VM on x86_64 and getting/setting its state.
pub trait VmX86_64: Vm {
    /// Gets the `HypervisorX86_64` that created this VM.
    fn get_hypervisor(&self) -> &dyn HypervisorX86_64;

    /// Create a Vcpu with the specified Vcpu ID.
    fn create_vcpu(&self, id: usize) -> Result<Box<dyn VcpuX86_64>>;

    /// Sets the address of the three-page region in the VM's address space.
    /// NOTE(review): presumably the TSS region used by the hypervisor — confirm
    /// against the hypervisor backends that implement this.
    fn set_tss_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Sets the address of a one-page region in the VM's address space.
    fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Load pVM firmware for the VM, creating a memslot for it as needed.
    ///
    /// Only works on protected VMs (i.e. those with vm_type == KVM_X86_PKVM_PROTECTED_VM).
    fn load_protected_vm_firmware(&mut self, fw_addr: GuestAddress, fw_max_size: u64)
        -> Result<()>;
}
76 
/// A wrapper around creating and using a VCPU on x86_64.
pub trait VcpuX86_64: Vcpu {
    /// Sets or clears the flag that requests the VCPU to exit when it becomes possible to inject
    /// interrupts into the guest.
    fn set_interrupt_window_requested(&self, requested: bool);

    /// Checks if we can inject an interrupt into the VCPU.
    fn ready_for_interrupt(&self) -> bool;

    /// Injects interrupt vector `irq` into the VCPU.
    ///
    /// This function should only be called when [`Self::ready_for_interrupt`] returns true.
    /// Otherwise the interrupt injection may fail or the next VCPU run may fail. However, if
    /// [`Self::interrupt`] returns [`Ok`], the implementation must guarantee that the interrupt
    /// isn't injected in an uninterruptible window (e.g. right after the mov ss instruction).
    ///
    /// The caller should avoid calling this function more than 1 time for one VMEXIT, because the
    /// hypervisor may behave differently: some hypervisors(e.g. WHPX, KVM) will only try to inject
    /// the last `irq` requested, while some other hypervisors(e.g. HAXM) may try to inject all
    /// `irq`s requested.
    fn interrupt(&self, irq: u8) -> Result<()>;

    /// Injects a non-maskable interrupt into the VCPU.
    fn inject_nmi(&self) -> Result<()>;

    /// Gets the VCPU general purpose registers.
    fn get_regs(&self) -> Result<Regs>;

    /// Sets the VCPU general purpose registers.
    fn set_regs(&self, regs: &Regs) -> Result<()>;

    /// Gets the VCPU special registers.
    fn get_sregs(&self) -> Result<Sregs>;

    /// Sets the VCPU special registers.
    fn set_sregs(&self, sregs: &Sregs) -> Result<()>;

    /// Gets the VCPU FPU registers.
    fn get_fpu(&self) -> Result<Fpu>;

    /// Sets the VCPU FPU registers.
    fn set_fpu(&self, fpu: &Fpu) -> Result<()>;

    /// Gets the VCPU debug registers.
    fn get_debugregs(&self) -> Result<DebugRegs>;

    /// Sets the VCPU debug registers.
    fn set_debugregs(&self, debugregs: &DebugRegs) -> Result<()>;

    /// Gets the VCPU extended control registers.
    fn get_xcrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a VCPU extended control register.
    fn set_xcr(&self, xcr: u32, value: u64) -> Result<()>;

    /// Gets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn get_xsave(&self) -> Result<Xsave>;

    /// Sets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn set_xsave(&self, xsave: &Xsave) -> Result<()>;

    /// Gets interrupt state (hypervisor specific) for this VCPU that must be
    /// saved/restored for snapshotting.
    fn get_interrupt_state(&self) -> Result<AnySnapshot>;

    /// Sets interrupt state (hypervisor specific) for this VCPU. Only used for
    /// snapshotting.
    fn set_interrupt_state(&self, data: AnySnapshot) -> Result<()>;

    /// Gets a single model-specific register's value.
    fn get_msr(&self, msr_index: u32) -> Result<u64>;

    /// Gets the model-specific registers. Returns all the MSRs for the VCPU.
    fn get_all_msrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a single model-specific register's value.
    fn set_msr(&self, msr_index: u32, value: u64) -> Result<()>;

    /// Sets up the data returned by the CPUID instruction.
    fn set_cpuid(&self, cpuid: &CpuId) -> Result<()>;

    /// Sets up debug registers and configure vcpu for handling guest debug events.
    fn set_guest_debug(&self, addrs: &[GuestAddress], enable_singlestep: bool) -> Result<()>;

    /// This function should be called after `Vcpu::run` returns `VcpuExit::Cpuid`, and `entry`
    /// should represent the result of emulating the CPUID instruction. The `handle_cpuid` function
    /// will then set the appropriate registers on the vcpu.
    fn handle_cpuid(&mut self, entry: &CpuIdEntry) -> Result<()>;

    /// Gets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::get_msr()`] to read the guest TSC.
    fn get_tsc_offset(&self) -> Result<u64> {
        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_before_tsc = unsafe { _rdtsc() };

        // get guest TSC value from our hypervisor
        let guest_tsc = self.get_msr(crate::MSR_IA32_TSC)?;

        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_after_tsc = unsafe { _rdtsc() };

        // Average the before and after host tsc to get the best value
        // (reduces the error introduced by the latency of get_msr itself).
        let host_tsc = ((host_before_tsc as u128 + host_after_tsc as u128) / 2) as u64;

        // Wrapping subtraction: the offset is defined modulo 2^64, so a guest
        // TSC numerically below the host TSC still yields a valid offset.
        Ok(guest_tsc.wrapping_sub(host_tsc))
    }

    /// Sets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::set_tsc_value()`] to set the TSC value.
    ///
    /// It sets TSC_OFFSET (VMCS / CB field) by setting the TSC MSR to the current
    /// host TSC value plus the desired offset. We rely on the fact that hypervisors
    /// determine the value of TSC_OFFSET by computing TSC_OFFSET = `new_tsc_value - _rdtsc()` =
    /// `_rdtsc() + offset - _rdtsc()` ~= `offset`. Note that the ~= is important: this is an
    /// approximate operation, because the two _rdtsc() calls
    /// are separated by at least a few ticks.
    ///
    /// Note: TSC_OFFSET, host TSC, guest TSC, and TSC MSR are all different
    /// concepts.
    /// * When a guest executes rdtsc, the value (guest TSC) returned is host_tsc * TSC_MULTIPLIER +
    ///   TSC_OFFSET + TSC_ADJUST.
    /// * The TSC MSR is a special MSR that when written to by the host, will cause TSC_OFFSET to be
    ///   set accordingly by the hypervisor.
    /// * When the guest *writes* to TSC MSR, it actually changes the TSC_ADJUST MSR *for the
    ///   guest*. Generally this is only happens if the guest is trying to re-zero or synchronize
    ///   TSCs.
    fn set_tsc_offset(&self, offset: u64) -> Result<()> {
        // SAFETY: _rdtsc takes no arguments.
        let host_tsc = unsafe { _rdtsc() };
        self.set_tsc_value(host_tsc.wrapping_add(offset))
    }

    /// Sets the guest TSC exactly to the provided value.
    ///
    /// The default implementation sets the guest's TSC by writing the value to the MSR directly.
    ///
    /// See [`VcpuX86_64::set_tsc_offset()`] for an explanation of how this value is actually read
    /// by the guest after being set.
    fn set_tsc_value(&self, value: u64) -> Result<()> {
        self.set_msr(crate::MSR_IA32_TSC, value)
    }

    /// Some hypervisors require special handling to restore timekeeping when
    /// a snapshot is restored. They are provided with a host TSC reference
    /// moment, guaranteed to be the same across all Vcpus, and the Vcpu's TSC
    /// offset at the moment it was snapshotted.
    fn restore_timekeeping(&self, host_tsc_reference_moment: u64, tsc_offset: u64) -> Result<()>;

    /// Snapshot vCPU state.
    ///
    /// Note: the x87 FPU state is not captured via `get_fpu`; it is covered by
    /// `xsave` (x87 FPU, MMX, XMM, YMM and MXCSR registers).
    fn snapshot(&self) -> anyhow::Result<VcpuSnapshot> {
        Ok(VcpuSnapshot {
            vcpu_id: self.id(),
            regs: self.get_regs()?,
            sregs: self.get_sregs()?,
            debug_regs: self.get_debugregs()?,
            xcrs: self.get_xcrs()?,
            msrs: self.get_all_msrs()?,
            xsave: self.get_xsave()?,
            hypervisor_data: self.get_interrupt_state()?,
            tsc_offset: self.get_tsc_offset()?,
        })
    }

    /// Restore this vCPU's state from `snapshot`.
    ///
    /// `host_tsc_reference_moment` is the common host TSC reference passed to
    /// [`Self::restore_timekeeping`]; see that method for its semantics.
    ///
    /// Panics if `snapshot.vcpu_id` does not match this vCPU's id.
    /// NOTE(review): state is restored in the order regs -> sregs -> debug regs ->
    /// xcrs -> msrs -> xsave -> interrupt state -> timekeeping; confirm before
    /// reordering, as xcr/xsave validity may depend on earlier writes.
    fn restore(
        &mut self,
        snapshot: &VcpuSnapshot,
        host_tsc_reference_moment: u64,
    ) -> anyhow::Result<()> {
        // List of MSRs that may fail to restore due to lack of support in the host kernel.
        // Some hosts are may be running older kernels which do not support all MSRs, but
        // get_all_msrs will still fetch the MSRs supported by the CPU. Trying to set those MSRs
        // will result in failures, so they will throw a warning instead.
        let msr_allowlist = HashSet::from([
            MSR_F15H_PERF_CTL0,
            MSR_F15H_PERF_CTL1,
            MSR_F15H_PERF_CTL2,
            MSR_F15H_PERF_CTL3,
            MSR_F15H_PERF_CTL4,
            MSR_F15H_PERF_CTL5,
            MSR_F15H_PERF_CTR0,
            MSR_F15H_PERF_CTR1,
            MSR_F15H_PERF_CTR2,
            MSR_F15H_PERF_CTR3,
            MSR_F15H_PERF_CTR4,
            MSR_F15H_PERF_CTR5,
            MSR_IA32_PERF_CAPABILITIES,
        ]);
        assert_eq!(snapshot.vcpu_id, self.id());
        self.set_regs(&snapshot.regs)?;
        self.set_sregs(&snapshot.sregs)?;
        self.set_debugregs(&snapshot.debug_regs)?;
        for (xcr_index, value) in &snapshot.xcrs {
            self.set_xcr(*xcr_index, *value)?;
        }

        for (msr_index, value) in snapshot.msrs.iter() {
            if self.get_msr(*msr_index) == Ok(*value) {
                continue; // no need to set MSR since the values are the same.
            }
            if let Err(e) = self.set_msr(*msr_index, *value) {
                if msr_allowlist.contains(msr_index) {
                    warn!(
                        "Failed to set MSR. MSR might not be supported in this kernel. Err: {}",
                        e
                    );
                } else {
                    return Err(e).context(
                        "Failed to set MSR. MSR might not be supported by the CPU or by the kernel,
                         and was not allow-listed.",
                    );
                }
            };
        }
        self.set_xsave(&snapshot.xsave)?;
        self.set_interrupt_state(snapshot.hypervisor_data.clone())?;
        self.restore_timekeeping(host_tsc_reference_moment, snapshot.tsc_offset)?;
        Ok(())
    }
}
300 
/// x86 specific vCPU snapshot.
///
/// Produced by [`VcpuX86_64::snapshot`] and consumed by [`VcpuX86_64::restore`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VcpuSnapshot {
    /// Id of the vCPU this state was captured from; `restore` asserts it matches.
    pub vcpu_id: usize,
    /// General purpose registers.
    regs: Regs,
    /// Special registers.
    sregs: Sregs,
    /// Debug registers.
    debug_regs: DebugRegs,
    /// Extended control registers, keyed by XCR index.
    xcrs: BTreeMap<u32, u64>,
    /// Model-specific registers, keyed by MSR index.
    msrs: BTreeMap<u32, u64>,
    /// x87 FPU / MMX / XMM / YMM / MXCSR state.
    xsave: Xsave,
    /// Hypervisor-specific interrupt state (opaque snapshot payload).
    hypervisor_data: AnySnapshot,
    /// Guest->host TSC offset at the moment of the snapshot.
    tsc_offset: u64,
}
314 
// Allow downcasting `dyn VcpuX86_64` trait objects to concrete implementations.
impl_downcast!(VcpuX86_64);

// TSC MSR (IA32_TIME_STAMP_COUNTER). Read/written through get_msr/set_msr by
// the default TSC helpers in `VcpuX86_64`.
pub const MSR_IA32_TSC: u32 = 0x00000010;
319 
/// Gets host cpu max physical address bits.
///
/// Queries CPUID leaf 0x80000008 (low 8 bits of EAX) when the CPU advertises
/// it; otherwise falls back to 36 bits.
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
pub(crate) fn host_phys_addr_bits() -> u8 {
    // SAFETY: trivially safe
    let max_extended_leaf = unsafe { __cpuid(0x80000000) }.eax;
    if max_extended_leaf < 0x80000008 {
        // Leaf 0x80000008 not implemented; use the legacy default.
        return 36;
    }
    // SAFETY: trivially safe
    let addr_size_leaf = unsafe { __cpuid(0x80000008) };
    // Low 8 bits of 0x80000008 leaf: host physical address size in bits.
    addr_size_leaf.eax as u8
}
334 
/// Initial state for x86_64 VCPUs.
///
/// `Default` gives the architectural reset state for `regs` (see `Regs::default`)
/// and empty/default values for the remaining fields.
#[derive(Clone, Default)]
pub struct VcpuInitX86_64 {
    /// General-purpose registers.
    pub regs: Regs,

    /// Special registers.
    pub sregs: Sregs,

    /// Floating-point registers.
    pub fpu: Fpu,

    /// Machine-specific registers, keyed by MSR index.
    pub msrs: BTreeMap<u32, u64>,
}
350 
/// Hold the CPU feature configurations that are needed to setup a vCPU.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CpuConfigX86_64 {
    /// Whether to force using a calibrated TSC leaf (0x15).
    pub force_calibrated_tsc_leaf: bool,

    /// Whether to mirror the host CPU topology into the guest.
    pub host_cpu_topology: bool,

    /// Whether to expose the HWP feature to the guest.
    pub enable_hwp: bool,

    /// Whether to disable SMT (Simultaneous Multithreading).
    pub no_smt: bool,

    /// Whether to enable the ITMT scheduler.
    pub itmt: bool,

    /// Hybrid CPU type to report, if any.
    pub hybrid_type: Option<CpuHybridType>,
}
372 
373 impl CpuConfigX86_64 {
new( force_calibrated_tsc_leaf: bool, host_cpu_topology: bool, enable_hwp: bool, no_smt: bool, itmt: bool, hybrid_type: Option<CpuHybridType>, ) -> Self374     pub fn new(
375         force_calibrated_tsc_leaf: bool,
376         host_cpu_topology: bool,
377         enable_hwp: bool,
378         no_smt: bool,
379         itmt: bool,
380         hybrid_type: Option<CpuHybridType>,
381     ) -> Self {
382         CpuConfigX86_64 {
383             force_calibrated_tsc_leaf,
384             host_cpu_topology,
385             enable_hwp,
386             no_smt,
387             itmt,
388             hybrid_type,
389         }
390     }
391 }
392 
/// A CpuId Entry contains supported feature information for the given processor.
/// This can be modified by the hypervisor to pass additional information to the guest kernel
/// about the hypervisor or vm. Information is returned in the eax, ebx, ecx and edx registers
/// by the cpu for a given function and index/subfunction (passed into the cpu via the eax and ecx
/// register respectively).
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CpuIdEntry {
    /// CPUID function (leaf), passed in EAX.
    pub function: u32,
    /// CPUID index (subleaf), passed in ECX.
    pub index: u32,
    // KVM needs `flags` preserved across the round trip
    // get_supported_cpuids() -> kvm_cpuid2 -> CpuId -> kvm_cpuid2 -> set_cpuid(),
    // so it is stored here on CpuIdEntry.
    pub flags: u32,
    /// Register values (EAX/EBX/ECX/EDX) reported for this leaf/subleaf.
    pub cpuid: CpuidResult,
}

/// A container for the list of cpu id entries for the hypervisor and underlying cpu.
pub struct CpuId {
    pub cpu_id_entries: Vec<CpuIdEntry>,
}

impl CpuId {
    /// Constructs a new `CpuId` with space preallocated for `initial_capacity` entries.
    pub fn new(initial_capacity: usize) -> Self {
        Self {
            cpu_id_entries: Vec::with_capacity(initial_capacity),
        }
    }
}
422 
/// Interrupt destination mode: physical APIC ID vs. logical destination.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DestinationMode {
    Physical = 0,
    Logical = 1,
}

/// Interrupt trigger mode: edge- vs. level-triggered.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TriggerMode {
    Edge = 0,
    Level = 1,
}

/// 3-bit interrupt delivery mode (shared by MSI data and IOAPIC redirection entries).
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryMode {
    Fixed = 0b000,
    Lowest = 0b001,      // Lowest-priority delivery.
    SMI = 0b010,         // System management interrupt
    RemoteRead = 0b011,  // This is no longer supported by intel.
    NMI = 0b100,         // Non maskable interrupt
    Init = 0b101,
    Startup = 0b110,
    External = 0b111,
}
449 
// These MSI structures are for Intel's implementation of MSI.  The PCI spec defines most of MSI,
// but the Intel spec defines the format of messages for raising interrupts.  The PCI spec defines
// three u32s -- the address, address_high, and data -- but Intel only makes use of the address and
// data.  The Intel portion of the specification is in Volume 3 section 10.11.
//
// Bitfield fields below are declared from the least-significant bit upward.
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiAddressMessage {
    pub reserved: BitField2,
    #[bits = 1]
    pub destination_mode: DestinationMode,
    pub redirection_hint: BitField1,
    pub reserved_2: BitField8,
    pub destination_id: BitField8,
    // According to Intel's implementation of MSI, these bits must always be 0xfee.
    pub always_0xfee: BitField12,
}

// The MSI data dword: vector, delivery mode, level, and trigger mode.
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiDataMessage {
    pub vector: BitField8,
    #[bits = 3]
    pub delivery_mode: DeliveryMode,
    pub reserved: BitField3,
    #[bits = 1]
    pub level: Level,
    #[bits = 1]
    pub trigger: TriggerMode,
    pub reserved2: BitField16,
}
480 
/// Delivery status of an interrupt: idle, or send pending.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryStatus {
    Idle = 0,
    Pending = 1,
}

/// The level of a level-triggered interrupt: asserted or deasserted.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    Deassert = 0,
    Assert = 1,
}
495 
/// Represents an IOAPIC redirection table entry (one 64-bit entry per IOAPIC pin).
///
/// Fields are declared from the least-significant bit upward.
#[bitfield]
#[derive(Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicRedirectionTableEntry {
    vector: BitField8,
    #[bits = 3]
    delivery_mode: DeliveryMode,
    #[bits = 1]
    dest_mode: DestinationMode,
    #[bits = 1]
    delivery_status: DeliveryStatus,
    polarity: BitField1,
    remote_irr: bool,
    #[bits = 1]
    trigger_mode: TriggerMode,
    interrupt_mask: bool, // true iff interrupts are masked.
    reserved: BitField39,
    dest_id: BitField8,
}

/// Number of pins on the standard KVM/IOAPIC.
pub const NUM_IOAPIC_PINS: usize = 24;
518 
/// Represents the state of the IOAPIC.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicState {
    /// base_address is the memory base address for this IOAPIC. It cannot be changed.
    pub base_address: u64,
    /// ioregsel register. Used for selecting which entry of the redirect table to read/write.
    pub ioregsel: u8,
    /// ioapicid register. Bits 24 - 27 contain the APIC ID for this device.
    pub ioapicid: u32,
    /// current_interrupt_level_bitmap represents a bitmap of the state of all of the irq lines
    pub current_interrupt_level_bitmap: u32,
    /// redirect_table contains the irq settings for each irq line.
    /// Serialized via the custom array helpers from `base::custom_serde`.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub redirect_table: [IoapicRedirectionTableEntry; NUM_IOAPIC_PINS],
}
538 
539 impl Default for IoapicState {
default() -> IoapicState540     fn default() -> IoapicState {
541         // SAFETY: trivially safe
542         unsafe { std::mem::zeroed() }
543     }
544 }
545 
/// Selects one of the two cascaded 8259 PICs.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PicSelect {
    Primary = 0,
    Secondary = 1,
}

/// PIC initialization sequence state: which initialization command word (ICW)
/// the PIC expects next. Defaults to the first (`Icw1`).
#[repr(C)]
#[derive(enumn::N, Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum PicInitState {
    #[default]
    Icw1 = 0,
    Icw2 = 1,
    Icw3 = 2,
    Icw4 = 3,
}
562 
563 /// Convenience implementation for converting from a u8
564 impl From<u8> for PicInitState {
from(item: u8) -> Self565     fn from(item: u8) -> Self {
566         PicInitState::n(item).unwrap_or_else(|| {
567             error!("Invalid PicInitState {}, setting to 0", item);
568             PicInitState::Icw1
569         })
570     }
571 }
572 
/// Represents the state of the PIC.
///
/// Field meanings follow the Intel 8259 PIC programming model.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PicState {
    /// Edge detection.
    pub last_irr: u8,
    /// Interrupt Request Register.
    pub irr: u8,
    /// Interrupt Mask Register.
    pub imr: u8,
    /// Interrupt Service Register.
    pub isr: u8,
    /// Highest priority, for priority rotation.
    pub priority_add: u8,
    /// Interrupt vector base for this PIC (NOTE(review): presumably ICW2 — confirm).
    pub irq_base: u8,
    /// Selects which register the next read returns.
    pub read_reg_select: bool,
    pub poll: bool,
    pub special_mask: bool,
    /// Which initialization command word the PIC expects next.
    pub init_state: PicInitState,
    pub auto_eoi: bool,
    pub rotate_on_auto_eoi: bool,
    pub special_fully_nested_mode: bool,
    /// PIC takes either 3 or 4 bytes of initialization command word during
    /// initialization. use_4_byte_icw is true if 4 bytes of ICW are needed.
    pub use_4_byte_icw: bool,
    /// "Edge/Level Control Registers", for edge trigger selection.
    /// When a particular bit is set, the corresponding IRQ is in level-triggered mode. Otherwise
    /// it is in edge-triggered mode.
    pub elcr: u8,
    pub elcr_mask: u8,
}
604 
/// The LapicState represents the state of an x86 CPU's Local APIC.
/// The Local APIC consists of 64 128-bit registers, but only the first 32-bits of each register
/// can be used, so this structure only stores the first 32-bits of each register.
#[repr(C)]
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct LapicState {
    // 64-element array needs the custom array (de)serializers from base.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub regs: [LapicRegister; 64],
}

/// The usable (low 32-bit) portion of one Local APIC register.
pub type LapicRegister = u32;
619 
620 // rust arrays longer than 32 need custom implementations of Debug
621 impl std::fmt::Debug for LapicState {
fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result622     fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
623         self.regs[..].fmt(formatter)
624     }
625 }
626 
627 // rust arrays longer than 32 need custom implementations of PartialEq
628 impl PartialEq for LapicState {
eq(&self, other: &LapicState) -> bool629     fn eq(&self, other: &LapicState) -> bool {
630         self.regs[..] == other.regs[..]
631     }
632 }
633 
634 // Lapic equality is reflexive, so we impl Eq
635 impl Eq for LapicState {}
636 
/// The PitState represents the state of the PIT (aka the Programmable Interval Timer).
/// The state is simply the state of its three channels.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitState {
    /// Per-channel state for the PIT's three counters.
    pub channels: [PitChannelState; 3],
    /// Hypervisor-specific flags for setting the pit state.
    pub flags: u32,
}
646 
/// The PitRWMode enum represents the access mode of a PIT channel.
/// Reads and writes to the Pit happen over Port-mapped I/O, which happens one byte at a time,
/// but the count values and latch values are two bytes. So the access mode controls which of the
/// two bytes will be read when.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWMode {
    /// None mode means that no access mode has been set.
    None = 0,
    /// Least mode means all reads/writes will read/write the least significant byte.
    Least = 1,
    /// Most mode means all reads/writes will read/write the most significant byte.
    Most = 2,
    /// Both mode means first the least significant byte will be read/written, then the
    /// next read/write will read/write the most significant byte.
    Both = 3,
}
664 
665 /// Convenience implementation for converting from a u8
666 impl From<u8> for PitRWMode {
from(item: u8) -> Self667     fn from(item: u8) -> Self {
668         PitRWMode::n(item).unwrap_or_else(|| {
669             error!("Invalid PitRWMode value {}, setting to 0", item);
670             PitRWMode::None
671         })
672     }
673 }
674 
/// The PitRWState enum represents the state of reading to or writing from a channel.
/// This is related to the PitRWMode, it mainly gives more detail about the state of the channel
/// with respect to PitRWMode::Both.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWState {
    /// None mode means that no access mode has been set.
    None = 0,
    /// LSB means that the channel is in PitRWMode::Least access mode.
    LSB = 1,
    /// MSB means that the channel is in PitRWMode::Most access mode.
    MSB = 2,
    /// Word0 means that the channel is in PitRWMode::Both mode, and the least significant byte
    /// has not been read/written yet.
    Word0 = 3,
    /// Word1 means that the channel is in PitRWMode::Both mode and the least significant byte
    /// has already been read/written, and the next byte to be read/written will be the most
    /// significant byte.
    Word1 = 4,
}
695 
696 /// Convenience implementation for converting from a u8
697 impl From<u8> for PitRWState {
from(item: u8) -> Self698     fn from(item: u8) -> Self {
699         PitRWState::n(item).unwrap_or_else(|| {
700             error!("Invalid PitRWState value {}, setting to 0", item);
701             PitRWState::None
702         })
703     }
704 }
705 
/// The PitChannelState represents the state of one of the PIT's three counters.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitChannelState {
    /// The starting value for the counter.
    pub count: u32,
    /// Stores the channel count from the last time the count was latched.
    pub latched_count: u16,
    /// Indicates the PitRWState state of reading the latch value.
    pub count_latched: PitRWState,
    /// Indicates whether ReadBack status has been latched.
    pub status_latched: bool,
    /// Stores the channel status from the last time the status was latched. The status contains
    /// information about the access mode of this channel, but changing those bits in the status
    /// will not change the behavior of the pit.
    pub status: u8,
    /// Indicates the PitRWState state of reading the counter.
    pub read_state: PitRWState,
    /// Indicates the PitRWState state of writing the counter.
    pub write_state: PitRWState,
    /// Stores the value with which the counter was initialized. Counters are 16-
    /// bit values with an effective range of 1-65536 (65536 represented by 0).
    pub reload_value: u16,
    /// The command access mode of this channel.
    pub rw_mode: PitRWMode,
    /// The operation mode of this channel.
    pub mode: u8,
    /// Whether or not we are in bcd mode. Not supported by KVM or crosvm's PIT implementation.
    pub bcd: bool,
    /// Value of the gate input pin. This only applies to channel 2.
    pub gate: bool,
    /// Nanosecond timestamp of when the count value was loaded.
    pub count_load_time: u64,
}
740 
741 // Convenience constructors for IrqRoutes
742 impl IrqRoute {
ioapic_irq_route(irq_num: u32) -> IrqRoute743     pub fn ioapic_irq_route(irq_num: u32) -> IrqRoute {
744         IrqRoute {
745             gsi: irq_num,
746             source: IrqSource::Irqchip {
747                 chip: IrqSourceChip::Ioapic,
748                 pin: irq_num,
749             },
750         }
751     }
752 
pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute753     pub fn pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute {
754         IrqRoute {
755             gsi: irq_num,
756             source: IrqSource::Irqchip {
757                 chip: id,
758                 pin: irq_num % 8,
759             },
760         }
761     }
762 }
763 
/// State of a VCPU's general purpose registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Regs {
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    /// Stack pointer.
    pub rsp: u64,
    /// Base (frame) pointer.
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    /// Instruction pointer.
    pub r15: u64,
    pub rip: u64,
    /// Flags register (RFLAGS).
    pub rflags: u64,
}
787 
788 impl Default for Regs {
default() -> Self789     fn default() -> Self {
790         Regs {
791             rax: 0,
792             rbx: 0,
793             rcx: 0,
794             rdx: 0,
795             rsi: 0,
796             rdi: 0,
797             rsp: 0,
798             rbp: 0,
799             r8: 0,
800             r9: 0,
801             r10: 0,
802             r11: 0,
803             r12: 0,
804             r13: 0,
805             r14: 0,
806             r15: 0,
807             rip: 0xfff0, // Reset vector.
808             rflags: 0x2, // Bit 1 (0x2) is always 1.
809         }
810     }
811 }
812 
/// State of a memory segment.
///
/// Mirrors the fields of an x86 segment descriptor plus the selector that
/// references it (Intel SDM Vol. 3A, 3.4.5).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Segment {
    /// Linear base address of the segment.
    pub base: u64,
    /// Limit of the segment - always in bytes, regardless of granularity (`g`) field.
    pub limit_bytes: u32,
    /// Segment selector.
    pub selector: u16,
    /// Segment type bits (code/data kind and access flags).
    pub type_: u8,
    /// Present flag (1 = segment is present).
    pub present: u8,
    /// Descriptor privilege level (0-3).
    pub dpl: u8,
    /// Default operation size flag (D/B bit).
    pub db: u8,
    /// Descriptor class flag (0 = system, 1 = code/data).
    pub s: u8,
    /// 64-bit code segment flag (L bit).
    pub l: u8,
    /// Granularity flag.
    pub g: u8,
    /// Available-for-software bit.
    pub avl: u8,
}
830 
/// State of a global descriptor table or interrupt descriptor table.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DescriptorTable {
    /// Linear base address of the table.
    pub base: u64,
    /// Table limit (size in bytes minus one, per x86 convention).
    pub limit: u16,
}
838 
/// State of a VCPU's special registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Sregs {
    // Segment registers.
    pub cs: Segment,
    pub ds: Segment,
    pub es: Segment,
    pub fs: Segment,
    pub gs: Segment,
    pub ss: Segment,
    /// Task register (TSS segment).
    pub tr: Segment,
    /// Local descriptor table register.
    pub ldt: Segment,
    /// Global descriptor table register.
    pub gdt: DescriptorTable,
    /// Interrupt descriptor table register.
    pub idt: DescriptorTable,
    // Control registers.
    pub cr0: u64,
    /// Page-fault linear address.
    pub cr2: u64,
    /// Page-table base address.
    pub cr3: u64,
    pub cr4: u64,
    /// Task priority register (TPR).
    pub cr8: u64,
    /// Extended feature enable register (EFER MSR).
    pub efer: u64,
}
860 
861 impl Default for Sregs {
default() -> Self862     fn default() -> Self {
863         // Intel SDM Vol. 3A, 3.4.5.1 ("Code- and Data-Segment Descriptor Types")
864         const SEG_TYPE_DATA: u8 = 0b0000;
865         const SEG_TYPE_DATA_WRITABLE: u8 = 0b0010;
866 
867         const SEG_TYPE_CODE: u8 = 0b1000;
868         const SEG_TYPE_CODE_READABLE: u8 = 0b0010;
869 
870         const SEG_TYPE_ACCESSED: u8 = 0b0001;
871 
872         // Intel SDM Vol. 3A, 3.4.5 ("Segment Descriptors")
873         const SEG_S_SYSTEM: u8 = 0; // System segment.
874         const SEG_S_CODE_OR_DATA: u8 = 1; // Data/code segment.
875 
876         // 16-bit real-mode code segment (reset vector).
877         let code_seg = Segment {
878             base: 0xffff0000,
879             limit_bytes: 0xffff,
880             selector: 0xf000,
881             type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
882             present: 1,
883             s: SEG_S_CODE_OR_DATA,
884             ..Default::default()
885         };
886 
887         // 16-bit real-mode data segment.
888         let data_seg = Segment {
889             base: 0,
890             limit_bytes: 0xffff,
891             selector: 0,
892             type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE | SEG_TYPE_ACCESSED, // 3
893             present: 1,
894             s: SEG_S_CODE_OR_DATA,
895             ..Default::default()
896         };
897 
898         // 16-bit TSS segment.
899         let task_seg = Segment {
900             base: 0,
901             limit_bytes: 0xffff,
902             selector: 0,
903             type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
904             present: 1,
905             s: SEG_S_SYSTEM,
906             ..Default::default()
907         };
908 
909         // Local descriptor table.
910         let ldt = Segment {
911             base: 0,
912             limit_bytes: 0xffff,
913             selector: 0,
914             type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE, // 2
915             present: 1,
916             s: SEG_S_SYSTEM,
917             ..Default::default()
918         };
919 
920         // Global descriptor table.
921         let gdt = DescriptorTable {
922             base: 0,
923             limit: 0xffff,
924         };
925 
926         // Interrupt descriptor table.
927         let idt = DescriptorTable {
928             base: 0,
929             limit: 0xffff,
930         };
931 
932         let cr0 = (1 << 4) // CR0.ET (reserved, always 1)
933                 | (1 << 30); // CR0.CD (cache disable)
934 
935         Sregs {
936             cs: code_seg,
937             ds: data_seg,
938             es: data_seg,
939             fs: data_seg,
940             gs: data_seg,
941             ss: data_seg,
942             tr: task_seg,
943             ldt,
944             gdt,
945             idt,
946             cr0,
947             cr2: 0,
948             cr3: 0,
949             cr4: 0,
950             cr8: 0,
951             efer: 0,
952         }
953     }
954 }
955 
/// x87 80-bit floating point value.
///
/// Stored as the raw double-extended-precision fields; the packed 10-byte
/// "TBYTE" layout is little-endian `significand` followed by `sign_exp`
/// (see the `From` conversions in this file).
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct FpuReg {
    /// 64-bit mantissa.
    pub significand: u64,

    /// 15-bit biased exponent and sign bit.
    pub sign_exp: u16,
}
966 
967 impl FpuReg {
968     /// Convert an array of 8x16-byte arrays to an array of 8 `FpuReg`.
969     ///
970     /// Ignores any data in the upper 6 bytes of each element; the values represent 80-bit FPU
971     /// registers, so the upper 48 bits are unused.
from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8]972     pub fn from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8] {
973         let mut regs = [FpuReg::default(); 8];
974         for (dst, src) in regs.iter_mut().zip(byte_arrays.iter()) {
975             let tbyte: [u8; 10] = src[0..10].try_into().unwrap();
976             *dst = FpuReg::from(tbyte);
977         }
978         regs
979     }
980 
981     /// Convert an array of 8 `FpuReg` into 8x16-byte arrays.
to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8]982     pub fn to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8] {
983         let mut byte_arrays = [[0u8; 16]; 8];
984         for (dst, src) in byte_arrays.iter_mut().zip(regs.iter()) {
985             *dst = (*src).into();
986         }
987         byte_arrays
988     }
989 }
990 
991 impl From<[u8; 10]> for FpuReg {
992     /// Construct a `FpuReg` from an 80-bit representation.
from(value: [u8; 10]) -> FpuReg993     fn from(value: [u8; 10]) -> FpuReg {
994         // These array sub-slices can't fail, but there's no (safe) way to express that in Rust
995         // without an `unwrap()`.
996         let significand_bytes = value[0..8].try_into().unwrap();
997         let significand = u64::from_le_bytes(significand_bytes);
998         let sign_exp_bytes = value[8..10].try_into().unwrap();
999         let sign_exp = u16::from_le_bytes(sign_exp_bytes);
1000         FpuReg {
1001             significand,
1002             sign_exp,
1003         }
1004     }
1005 }
1006 
1007 impl From<FpuReg> for [u8; 10] {
1008     /// Convert an `FpuReg` into its 80-bit "TBYTE" representation.
from(value: FpuReg) -> [u8; 10]1009     fn from(value: FpuReg) -> [u8; 10] {
1010         let mut bytes = [0u8; 10];
1011         bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1012         bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1013         bytes
1014     }
1015 }
1016 
1017 impl From<FpuReg> for [u8; 16] {
1018     /// Convert an `FpuReg` into its 80-bit representation plus 6 unused upper bytes.
1019     /// This is a convenience function for converting to hypervisor types.
from(value: FpuReg) -> [u8; 16]1020     fn from(value: FpuReg) -> [u8; 16] {
1021         let mut bytes = [0u8; 16];
1022         bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1023         bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1024         bytes
1025     }
1026 }
1027 
/// State of a VCPU's floating point unit.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Fpu {
    /// The eight x87 data registers.
    pub fpr: [FpuReg; 8],
    /// FPU control word.
    pub fcw: u16,
    /// FPU status word.
    pub fsw: u16,
    /// Tag word in abridged form (one bit per x87 register).
    pub ftwx: u8,
    /// Opcode of the last executed FPU instruction.
    pub last_opcode: u16,
    /// Instruction pointer of the last executed FPU instruction.
    pub last_ip: u64,
    /// Data (operand) pointer of the last FPU instruction.
    pub last_dp: u64,
    /// The 16 XMM registers.
    pub xmm: [[u8; 16usize]; 16usize],
    /// SSE control/status register.
    pub mxcsr: u32,
}
1042 
1043 impl Default for Fpu {
default() -> Self1044     fn default() -> Self {
1045         Fpu {
1046             fpr: Default::default(),
1047             fcw: 0x37f, // Intel SDM Vol. 1, 13.6
1048             fsw: 0,
1049             ftwx: 0,
1050             last_opcode: 0,
1051             last_ip: 0,
1052             last_dp: 0,
1053             xmm: Default::default(),
1054             mxcsr: 0x1f80, // Intel SDM Vol. 1, 11.6.4
1055         }
1056     }
1057 }
1058 
/// State of a VCPU's debug registers.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DebugRegs {
    /// Hardware breakpoint linear addresses (DR0-DR3).
    pub db: [u64; 4usize],
    /// Debug status register.
    pub dr6: u64,
    /// Debug control register.
    pub dr7: u64,
}
1067 
/// The hybrid type for intel hybrid CPU.
///
/// Identifies which core type a vCPU models on heterogeneous ("hybrid")
/// Intel processors that mix core kinds on one package.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum CpuHybridType {
    /// Intel Atom.
    Atom,
    /// Intel Core.
    Core,
}
1076 
/// State of the VCPU's x87 FPU, MMX, XMM, YMM registers.
/// May contain more state depending on enabled extensions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Xsave {
    // Backing store for the XSAVE region, held as u32 words.
    data: Vec<u32>,

    // Actual length in bytes. May be smaller than data if a non-u32 multiple of bytes is
    // requested.
    len: usize,
}
1087 
1088 impl Xsave {
1089     /// Create a new buffer to store Xsave data.
1090     ///
1091     /// # Argments
1092     /// * `len` size in bytes.
new(len: usize) -> Self1093     pub fn new(len: usize) -> Self {
1094         Xsave {
1095             data: vec![0; (len + 3) / 4],
1096             len,
1097         }
1098     }
1099 
as_ptr(&self) -> *const c_void1100     pub fn as_ptr(&self) -> *const c_void {
1101         self.data.as_ptr() as *const c_void
1102     }
1103 
as_mut_ptr(&mut self) -> *mut c_void1104     pub fn as_mut_ptr(&mut self) -> *mut c_void {
1105         self.data.as_mut_ptr() as *mut c_void
1106     }
1107 
1108     /// Length in bytes of the XSAVE data.
len(&self) -> usize1109     pub fn len(&self) -> usize {
1110         self.len
1111     }
1112 
1113     /// Returns true is length of XSAVE data is zero
is_empty(&self) -> bool1114     pub fn is_empty(&self) -> bool {
1115         self.len() == 0
1116     }
1117 }
1118