1 // Copyright 2020 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::arch::x86_64::CpuidResult;
6 #[cfg(any(unix, feature = "haxm", feature = "whpx"))]
7 use std::arch::x86_64::__cpuid;
8 use std::arch::x86_64::_rdtsc;
9 use std::collections::BTreeMap;
10 use std::collections::HashSet;
11
12 use anyhow::Context;
13 use base::custom_serde::deserialize_seq_to_arr;
14 use base::custom_serde::serialize_arr;
15 use base::error;
16 use base::warn;
17 use base::Result;
18 use bit_field::*;
19 use downcast_rs::impl_downcast;
20 use libc::c_void;
21 use serde::Deserialize;
22 use serde::Serialize;
23 use snapshot::AnySnapshot;
24 use vm_memory::GuestAddress;
25
26 use crate::Hypervisor;
27 use crate::IrqRoute;
28 use crate::IrqSource;
29 use crate::IrqSourceChip;
30 use crate::Vcpu;
31 use crate::Vm;
32
// AMD Family 15h performance event-select (PERF_CTL) and counter (PERF_CTR)
// MSRs, plus Intel's IA32_PERF_CAPABILITIES. These indices are allow-listed in
// `VcpuX86_64::restore` below: setting them can fail on older host kernels, in
// which case the failure is downgraded to a warning.
const MSR_F15H_PERF_CTL0: u32 = 0xc0010200;
const MSR_F15H_PERF_CTL1: u32 = 0xc0010202;
const MSR_F15H_PERF_CTL2: u32 = 0xc0010204;
const MSR_F15H_PERF_CTL3: u32 = 0xc0010206;
const MSR_F15H_PERF_CTL4: u32 = 0xc0010208;
const MSR_F15H_PERF_CTL5: u32 = 0xc001020a;
const MSR_F15H_PERF_CTR0: u32 = 0xc0010201;
const MSR_F15H_PERF_CTR1: u32 = 0xc0010203;
const MSR_F15H_PERF_CTR2: u32 = 0xc0010205;
const MSR_F15H_PERF_CTR3: u32 = 0xc0010207;
const MSR_F15H_PERF_CTR4: u32 = 0xc0010209;
const MSR_F15H_PERF_CTR5: u32 = 0xc001020b;
const MSR_IA32_PERF_CAPABILITIES: u32 = 0x00000345;
46
/// A trait for managing cpuids for an x86_64 hypervisor and for checking its capabilities.
pub trait HypervisorX86_64: Hypervisor {
    /// Gets the CPUID leaves/values supported by the hypervisor and host CPU.
    fn get_supported_cpuid(&self) -> Result<CpuId>;

    /// Gets the list of MSR indices supported by the hypervisor.
    fn get_msr_index_list(&self) -> Result<Vec<u32>>;
}
55
/// A wrapper for using a VM on x86_64 and getting/setting its state.
pub trait VmX86_64: Vm {
    /// Gets the `HypervisorX86_64` that created this VM.
    fn get_hypervisor(&self) -> &dyn HypervisorX86_64;

    /// Create a Vcpu with the specified Vcpu ID.
    fn create_vcpu(&self, id: usize) -> Result<Box<dyn VcpuX86_64>>;

    /// Sets the address of the three-page region in the VM's address space.
    // NOTE(review): by the method name this is the TSS region — confirm against
    // hypervisor-specific implementations.
    fn set_tss_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Sets the address of a one-page region in the VM's address space.
    fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Load pVM firmware for the VM, creating a memslot for it as needed.
    ///
    /// Only works on protected VMs (i.e. those with vm_type == KVM_X86_PKVM_PROTECTED_VM).
    fn load_protected_vm_firmware(&mut self, fw_addr: GuestAddress, fw_max_size: u64)
        -> Result<()>;
}
76
/// A wrapper around creating and using a VCPU on x86_64.
pub trait VcpuX86_64: Vcpu {
    /// Sets or clears the flag that requests the VCPU to exit when it becomes possible to inject
    /// interrupts into the guest.
    fn set_interrupt_window_requested(&self, requested: bool);

    /// Checks if we can inject an interrupt into the VCPU.
    fn ready_for_interrupt(&self) -> bool;

    /// Injects interrupt vector `irq` into the VCPU.
    ///
    /// This function should only be called when [`Self::ready_for_interrupt`] returns true.
    /// Otherwise the interrupt injection may fail or the next VCPU run may fail. However, if
    /// [`Self::interrupt`] returns [`Ok`], the implementation must guarantee that the interrupt
    /// isn't injected in an uninterruptible window (e.g. right after the mov ss instruction).
    ///
    /// The caller should avoid calling this function more than 1 time for one VMEXIT, because the
    /// hypervisor may behave differently: some hypervisors(e.g. WHPX, KVM) will only try to inject
    /// the last `irq` requested, while some other hypervisors(e.g. HAXM) may try to inject all
    /// `irq`s requested.
    fn interrupt(&self, irq: u8) -> Result<()>;

    /// Injects a non-maskable interrupt into the VCPU.
    fn inject_nmi(&self) -> Result<()>;

    /// Gets the VCPU general purpose registers.
    fn get_regs(&self) -> Result<Regs>;

    /// Sets the VCPU general purpose registers.
    fn set_regs(&self, regs: &Regs) -> Result<()>;

    /// Gets the VCPU special registers.
    fn get_sregs(&self) -> Result<Sregs>;

    /// Sets the VCPU special registers.
    fn set_sregs(&self, sregs: &Sregs) -> Result<()>;

    /// Gets the VCPU FPU registers.
    fn get_fpu(&self) -> Result<Fpu>;

    /// Sets the VCPU FPU registers.
    fn set_fpu(&self, fpu: &Fpu) -> Result<()>;

    /// Gets the VCPU debug registers.
    fn get_debugregs(&self) -> Result<DebugRegs>;

    /// Sets the VCPU debug registers.
    fn set_debugregs(&self, debugregs: &DebugRegs) -> Result<()>;

    /// Gets the VCPU extended control registers, keyed by XCR index.
    fn get_xcrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a VCPU extended control register.
    fn set_xcr(&self, xcr: u32, value: u64) -> Result<()>;

    /// Gets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn get_xsave(&self) -> Result<Xsave>;

    /// Sets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn set_xsave(&self, xsave: &Xsave) -> Result<()>;

    /// Gets interrupt state (hypervisor specific) for this VCPU that must be
    /// saved/restored for snapshotting.
    fn get_interrupt_state(&self) -> Result<AnySnapshot>;

    /// Sets interrupt state (hypervisor specific) for this VCPU. Only used for
    /// snapshotting.
    fn set_interrupt_state(&self, data: AnySnapshot) -> Result<()>;

    /// Gets a single model-specific register's value.
    fn get_msr(&self, msr_index: u32) -> Result<u64>;

    /// Gets the model-specific registers. Returns all the MSRs for the VCPU.
    fn get_all_msrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a single model-specific register's value.
    fn set_msr(&self, msr_index: u32, value: u64) -> Result<()>;

    /// Sets up the data returned by the CPUID instruction.
    fn set_cpuid(&self, cpuid: &CpuId) -> Result<()>;

    /// Sets up debug registers and configure vcpu for handling guest debug events.
    fn set_guest_debug(&self, addrs: &[GuestAddress], enable_singlestep: bool) -> Result<()>;

    /// This function should be called after `Vcpu::run` returns `VcpuExit::Cpuid`, and `entry`
    /// should represent the result of emulating the CPUID instruction. The `handle_cpuid` function
    /// will then set the appropriate registers on the vcpu.
    fn handle_cpuid(&mut self, entry: &CpuIdEntry) -> Result<()>;

    /// Gets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::get_msr()`] to read the guest TSC.
    fn get_tsc_offset(&self) -> Result<u64> {
        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_before_tsc = unsafe { _rdtsc() };

        // get guest TSC value from our hypervisor
        let guest_tsc = self.get_msr(crate::MSR_IA32_TSC)?;

        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_after_tsc = unsafe { _rdtsc() };

        // Average the before and after host tsc to get the best value.
        // The sum is done in u128 so two large u64 readings cannot overflow.
        let host_tsc = ((host_before_tsc as u128 + host_after_tsc as u128) / 2) as u64;

        Ok(guest_tsc.wrapping_sub(host_tsc))
    }

    /// Sets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::set_tsc_value()`] to set the TSC value.
    ///
    /// It sets TSC_OFFSET (VMCS / CB field) by setting the TSC MSR to the current
    /// host TSC value plus the desired offset. We rely on the fact that hypervisors
    /// determine the value of TSC_OFFSET by computing TSC_OFFSET = `new_tsc_value - _rdtsc()` =
    /// `_rdtsc() + offset - _rdtsc()` ~= `offset`. Note that the ~= is important: this is an
    /// approximate operation, because the two _rdtsc() calls
    /// are separated by at least a few ticks.
    ///
    /// Note: TSC_OFFSET, host TSC, guest TSC, and TSC MSR are all different
    /// concepts.
    /// * When a guest executes rdtsc, the value (guest TSC) returned is host_tsc * TSC_MULTIPLIER +
    ///   TSC_OFFSET + TSC_ADJUST.
    /// * The TSC MSR is a special MSR that when written to by the host, will cause TSC_OFFSET to be
    ///   set accordingly by the hypervisor.
    /// * When the guest *writes* to TSC MSR, it actually changes the TSC_ADJUST MSR *for the
    ///   guest*. Generally this only happens if the guest is trying to re-zero or synchronize
    ///   TSCs.
    fn set_tsc_offset(&self, offset: u64) -> Result<()> {
        // SAFETY: _rdtsc takes no arguments.
        let host_tsc = unsafe { _rdtsc() };
        self.set_tsc_value(host_tsc.wrapping_add(offset))
    }

    /// Sets the guest TSC exactly to the provided value.
    ///
    /// The default implementation sets the guest's TSC by writing the value to the MSR directly.
    ///
    /// See [`VcpuX86_64::set_tsc_offset()`] for an explanation of how this value is actually read
    /// by the guest after being set.
    fn set_tsc_value(&self, value: u64) -> Result<()> {
        self.set_msr(crate::MSR_IA32_TSC, value)
    }

    /// Some hypervisors require special handling to restore timekeeping when
    /// a snapshot is restored. They are provided with a host TSC reference
    /// moment, guaranteed to be the same across all Vcpus, and the Vcpu's TSC
    /// offset at the moment it was snapshotted.
    fn restore_timekeeping(&self, host_tsc_reference_moment: u64, tsc_offset: u64) -> Result<()>;

    /// Snapshot vCPU state by reading every register class plus the
    /// hypervisor-specific interrupt state and TSC offset.
    fn snapshot(&self) -> anyhow::Result<VcpuSnapshot> {
        Ok(VcpuSnapshot {
            vcpu_id: self.id(),
            regs: self.get_regs()?,
            sregs: self.get_sregs()?,
            debug_regs: self.get_debugregs()?,
            xcrs: self.get_xcrs()?,
            msrs: self.get_all_msrs()?,
            xsave: self.get_xsave()?,
            hypervisor_data: self.get_interrupt_state()?,
            tsc_offset: self.get_tsc_offset()?,
        })
    }

    /// Restore vCPU state from `snapshot`. Panics if the snapshot was taken
    /// from a different vCPU id than `self`.
    fn restore(
        &mut self,
        snapshot: &VcpuSnapshot,
        host_tsc_reference_moment: u64,
    ) -> anyhow::Result<()> {
        // List of MSRs that may fail to restore due to lack of support in the host kernel.
        // Some hosts may be running older kernels which do not support all MSRs, but
        // get_all_msrs will still fetch the MSRs supported by the CPU. Trying to set those MSRs
        // will result in failures, so they will throw a warning instead.
        let msr_allowlist = HashSet::from([
            MSR_F15H_PERF_CTL0,
            MSR_F15H_PERF_CTL1,
            MSR_F15H_PERF_CTL2,
            MSR_F15H_PERF_CTL3,
            MSR_F15H_PERF_CTL4,
            MSR_F15H_PERF_CTL5,
            MSR_F15H_PERF_CTR0,
            MSR_F15H_PERF_CTR1,
            MSR_F15H_PERF_CTR2,
            MSR_F15H_PERF_CTR3,
            MSR_F15H_PERF_CTR4,
            MSR_F15H_PERF_CTR5,
            MSR_IA32_PERF_CAPABILITIES,
        ]);
        assert_eq!(snapshot.vcpu_id, self.id());
        self.set_regs(&snapshot.regs)?;
        self.set_sregs(&snapshot.sregs)?;
        self.set_debugregs(&snapshot.debug_regs)?;
        for (xcr_index, value) in &snapshot.xcrs {
            self.set_xcr(*xcr_index, *value)?;
        }

        for (msr_index, value) in snapshot.msrs.iter() {
            if self.get_msr(*msr_index) == Ok(*value) {
                continue; // no need to set MSR since the values are the same.
            }
            if let Err(e) = self.set_msr(*msr_index, *value) {
                if msr_allowlist.contains(msr_index) {
                    warn!(
                        "Failed to set MSR. MSR might not be supported in this kernel. Err: {}",
                        e
                    );
                } else {
                    return Err(e).context(
                        "Failed to set MSR. MSR might not be supported by the CPU or by the kernel,
                         and was not allow-listed.",
                    );
                }
            };
        }
        self.set_xsave(&snapshot.xsave)?;
        self.set_interrupt_state(snapshot.hypervisor_data.clone())?;
        self.restore_timekeeping(host_tsc_reference_moment, snapshot.tsc_offset)?;
        Ok(())
    }
}
300
/// x86 specific vCPU snapshot.
///
/// Produced by [`VcpuX86_64::snapshot`] and consumed by [`VcpuX86_64::restore`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VcpuSnapshot {
    /// Id of the vCPU this snapshot was taken from; verified on restore.
    pub vcpu_id: usize,
    // General-purpose registers.
    regs: Regs,
    // Special registers (segments, control registers, descriptor tables).
    sregs: Sregs,
    // Debug registers.
    debug_regs: DebugRegs,
    // Extended control registers, keyed by XCR index.
    xcrs: BTreeMap<u32, u64>,
    // Model-specific registers, keyed by MSR index.
    msrs: BTreeMap<u32, u64>,
    // x87/MMX/XMM/YMM/MXCSR state.
    xsave: Xsave,
    // Opaque hypervisor-specific interrupt state.
    hypervisor_data: AnySnapshot,
    // Guest->host TSC offset at snapshot time.
    tsc_offset: u64,
}
314
// Allow downcasting `dyn VcpuX86_64` trait objects to concrete implementation
// types (macro provided by the `downcast_rs` crate).
impl_downcast!(VcpuX86_64);

// MSR index of the TSC (IA32_TIME_STAMP_COUNTER) register.
pub const MSR_IA32_TSC: u32 = 0x00000010;
319
/// Gets host cpu max physical address bits.
///
/// Queries CPUID leaf 0x80000008 when the host exposes it; otherwise falls
/// back to 36 bits.
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
pub(crate) fn host_phys_addr_bits() -> u8 {
    // SAFETY: the cpuid intrinsic has no memory-safety preconditions.
    let max_extended_leaf = unsafe { __cpuid(0x80000000) }.eax;
    if max_extended_leaf < 0x80000008 {
        // Address-size leaf unavailable: report the fallback of 36 bits.
        return 36;
    }
    // SAFETY: the cpuid intrinsic has no memory-safety preconditions.
    let addr_size_leaf = unsafe { __cpuid(0x80000008) };
    // Low 8 bits of 0x80000008 leaf: host physical address size in bits.
    addr_size_leaf.eax as u8
}
334
/// Initial state for x86_64 VCPUs.
#[derive(Clone, Default)]
pub struct VcpuInitX86_64 {
    /// General-purpose registers.
    pub regs: Regs,

    /// Special registers.
    pub sregs: Sregs,

    /// Floating-point registers.
    pub fpu: Fpu,

    /// Model-specific registers, keyed by MSR index.
    pub msrs: BTreeMap<u32, u64>,
}
350
/// Hold the CPU feature configurations that are needed to setup a vCPU.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CpuConfigX86_64 {
    /// Whether to force using a calibrated TSC leaf (0x15).
    pub force_calibrated_tsc_leaf: bool,

    /// Whether to expose the host CPU topology to the guest.
    pub host_cpu_topology: bool,

    /// Whether to expose the HWP feature to the guest.
    pub enable_hwp: bool,

    /// Whether to disable SMT (Simultaneous Multithreading).
    pub no_smt: bool,

    /// Whether to enable the ITMT scheduler.
    pub itmt: bool,

    /// Hybrid CPU type to expose to the guest, if any.
    pub hybrid_type: Option<CpuHybridType>,
}
372
373 impl CpuConfigX86_64 {
new( force_calibrated_tsc_leaf: bool, host_cpu_topology: bool, enable_hwp: bool, no_smt: bool, itmt: bool, hybrid_type: Option<CpuHybridType>, ) -> Self374 pub fn new(
375 force_calibrated_tsc_leaf: bool,
376 host_cpu_topology: bool,
377 enable_hwp: bool,
378 no_smt: bool,
379 itmt: bool,
380 hybrid_type: Option<CpuHybridType>,
381 ) -> Self {
382 CpuConfigX86_64 {
383 force_calibrated_tsc_leaf,
384 host_cpu_topology,
385 enable_hwp,
386 no_smt,
387 itmt,
388 hybrid_type,
389 }
390 }
391 }
392
/// A CpuId Entry contains supported feature information for the given processor.
/// This can be modified by the hypervisor to pass additional information to the guest kernel
/// about the hypervisor or vm. Information is returned in the eax, ebx, ecx and edx registers
/// by the cpu for a given function and index/subfunction (passed into the cpu via the eax and ecx
/// register respectively).
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CpuIdEntry {
    /// CPUID leaf number (the EAX input value).
    pub function: u32,
    /// CPUID subleaf number (the ECX input value).
    pub index: u32,
    // flags is needed for KVM. We store it on CpuIdEntry to preserve the flags across
    // get_supported_cpuids() -> kvm_cpuid2 -> CpuId -> kvm_cpuid2 -> set_cpuid().
    pub flags: u32,
    /// Register values (EAX/EBX/ECX/EDX) the CPUID instruction returns for this leaf.
    pub cpuid: CpuidResult,
}
408
/// A container for the list of cpu id entries for the hypervisor and underlying cpu.
pub struct CpuId {
    /// One entry per CPUID function/index pair.
    pub cpu_id_entries: Vec<CpuIdEntry>,
}
413
414 impl CpuId {
415 /// Constructs a new CpuId, with space allocated for `initial_capacity` CpuIdEntries.
new(initial_capacity: usize) -> Self416 pub fn new(initial_capacity: usize) -> Self {
417 CpuId {
418 cpu_id_entries: Vec::with_capacity(initial_capacity),
419 }
420 }
421 }
422
/// Interrupt destination mode bit used in MSI address messages and IOAPIC
/// redirection entries: physical or logical destination addressing.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DestinationMode {
    Physical = 0,
    Logical = 1,
}
429
/// Interrupt trigger mode: edge-triggered or level-triggered.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TriggerMode {
    Edge = 0,
    Level = 1,
}
436
/// Interrupt delivery mode (a 3-bit field in MSI data messages and IOAPIC
/// redirection entries).
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryMode {
    Fixed = 0b000,
    Lowest = 0b001,
    SMI = 0b010,        // System management interrupt
    RemoteRead = 0b011, // This is no longer supported by intel.
    NMI = 0b100,        // Non maskable interrupt
    Init = 0b101,
    Startup = 0b110,
    External = 0b111,
}
449
// These MSI structures are for Intel's implementation of MSI. The PCI spec defines most of MSI,
// but the Intel spec defines the format of messages for raising interrupts. The PCI spec defines
// three u32s -- the address, address_high, and data -- but Intel only makes use of the address and
// data. The Intel portion of the specification is in Volume 3 section 10.11.
/// The 32-bit MSI address message, laid out low-bit first.
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiAddressMessage {
    pub reserved: BitField2,
    #[bits = 1]
    pub destination_mode: DestinationMode,
    pub redirection_hint: BitField1,
    pub reserved_2: BitField8,
    /// Target APIC ID (interpretation depends on `destination_mode`).
    pub destination_id: BitField8,
    // According to Intel's implementation of MSI, these bits must always be 0xfee.
    pub always_0xfee: BitField12,
}
466
/// The 32-bit MSI data message, laid out low-bit first.
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiDataMessage {
    /// Interrupt vector to deliver.
    pub vector: BitField8,
    #[bits = 3]
    pub delivery_mode: DeliveryMode,
    pub reserved: BitField3,
    #[bits = 1]
    pub level: Level,
    #[bits = 1]
    pub trigger: TriggerMode,
    pub reserved2: BitField16,
}
480
/// Delivery status of an interrupt: idle, or delivery still pending.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryStatus {
    Idle = 0,
    Pending = 1,
}
487
/// The level of a level-triggered interrupt: asserted or deasserted.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    Deassert = 0,
    Assert = 1,
}
495
/// Represents a IOAPIC redirection table entry.
#[bitfield]
#[derive(Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicRedirectionTableEntry {
    /// Interrupt vector to deliver.
    vector: BitField8,
    #[bits = 3]
    delivery_mode: DeliveryMode,
    #[bits = 1]
    dest_mode: DestinationMode,
    #[bits = 1]
    delivery_status: DeliveryStatus,
    /// Pin polarity bit.
    polarity: BitField1,
    /// Remote IRR bit, used for level-triggered interrupt bookkeeping.
    remote_irr: bool,
    #[bits = 1]
    trigger_mode: TriggerMode,
    interrupt_mask: bool, // true iff interrupts are masked.
    reserved: BitField39,
    /// Destination (interpretation depends on `dest_mode`).
    dest_id: BitField8,
}
515
/// Number of pins on the standard KVM/IOAPIC. Also sizes the redirection
/// table in `IoapicState`.
pub const NUM_IOAPIC_PINS: usize = 24;
518
/// Represents the state of the IOAPIC.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicState {
    /// base_address is the memory base address for this IOAPIC. It cannot be changed.
    pub base_address: u64,
    /// ioregsel register. Used for selecting which entry of the redirect table to read/write.
    pub ioregsel: u8,
    /// ioapicid register. Bits 24 - 27 contain the APIC ID for this device.
    pub ioapicid: u32,
    /// current_interrupt_level_bitmap represents a bitmap of the state of all of the irq lines
    pub current_interrupt_level_bitmap: u32,
    /// redirect_table contains the irq settings for each irq line
    // Custom (de)serializers because serde does not natively support arrays
    // longer than 32 elements.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub redirect_table: [IoapicRedirectionTableEntry; NUM_IOAPIC_PINS],
}
538
impl Default for IoapicState {
    fn default() -> IoapicState {
        // SAFETY: `IoapicState` is a `repr(C)` struct composed entirely of
        // integers and integer-backed bitfields, for which an all-zero bit
        // pattern is a valid value.
        unsafe { std::mem::zeroed() }
    }
}
545
/// Selects one of the two PICs: the primary or the secondary controller.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PicSelect {
    Primary = 0,
    Secondary = 1,
}
552
/// Initialization phase of the PIC: which initialization command word (ICW)
/// it expects next.
#[repr(C)]
#[derive(enumn::N, Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum PicInitState {
    #[default]
    Icw1 = 0,
    Icw2 = 1,
    Icw3 = 2,
    Icw4 = 3,
}
562
563 /// Convenience implementation for converting from a u8
564 impl From<u8> for PicInitState {
from(item: u8) -> Self565 fn from(item: u8) -> Self {
566 PicInitState::n(item).unwrap_or_else(|| {
567 error!("Invalid PicInitState {}, setting to 0", item);
568 PicInitState::Icw1
569 })
570 }
571 }
572
/// Represents the state of the PIC.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PicState {
    /// Edge detection.
    pub last_irr: u8,
    /// Interrupt Request Register.
    pub irr: u8,
    /// Interrupt Mask Register.
    pub imr: u8,
    /// Interrupt Service Register.
    pub isr: u8,
    /// Highest priority, for priority rotation.
    pub priority_add: u8,
    /// Vector base for this PIC's IRQs (set during initialization).
    pub irq_base: u8,
    /// Selects which register a read returns (IRR vs ISR).
    pub read_reg_select: bool,
    /// Poll-mode flag.
    pub poll: bool,
    /// Special mask mode flag.
    pub special_mask: bool,
    /// Which initialization command word the PIC expects next.
    pub init_state: PicInitState,
    /// Automatic end-of-interrupt mode.
    pub auto_eoi: bool,
    /// Rotate priorities on automatic EOI.
    pub rotate_on_auto_eoi: bool,
    /// Special fully-nested mode flag.
    pub special_fully_nested_mode: bool,
    /// PIC takes either 3 or 4 bytes of initialization command word during
    /// initialization. use_4_byte_icw is true if 4 bytes of ICW are needed.
    pub use_4_byte_icw: bool,
    /// "Edge/Level Control Registers", for edge trigger selection.
    /// When a particular bit is set, the corresponding IRQ is in level-triggered mode. Otherwise
    /// it is in edge-triggered mode.
    pub elcr: u8,
    pub elcr_mask: u8,
}
604
/// The LapicState represents the state of an x86 CPU's Local APIC.
/// The Local APIC consists of 64 128-bit registers, but only the first 32-bits of each register
/// can be used, so this structure only stores the first 32-bits of each register.
#[repr(C)]
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct LapicState {
    // Custom (de)serializers because serde does not natively support arrays
    // longer than 32 elements.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub regs: [LapicRegister; 64],
}
617
/// A single Local APIC register, truncated to its usable low 32 bits.
pub type LapicRegister = u32;
619
620 // rust arrays longer than 32 need custom implementations of Debug
621 impl std::fmt::Debug for LapicState {
fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result622 fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
623 self.regs[..].fmt(formatter)
624 }
625 }
626
627 // rust arrays longer than 32 need custom implementations of PartialEq
628 impl PartialEq for LapicState {
eq(&self, other: &LapicState) -> bool629 fn eq(&self, other: &LapicState) -> bool {
630 self.regs[..] == other.regs[..]
631 }
632 }
633
// Lapic equality is reflexive (u32 registers have no NaN-like values), so we impl Eq
impl Eq for LapicState {}
636
/// The PitState represents the state of the PIT (aka the Programmable Interval Timer).
/// The state is simply the state of it's three channels.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitState {
    /// One state entry per PIT channel (channels 0-2).
    pub channels: [PitChannelState; 3],
    /// Hypervisor-specific flags for setting the pit state.
    pub flags: u32,
}
646
/// The PitRWMode enum represents the access mode of a PIT channel.
/// Reads and writes to the Pit happen over Port-mapped I/O, which happens one byte at a time,
/// but the count values and latch values are two bytes. So the access mode controls which of the
/// two bytes will be read when.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWMode {
    /// None mode means that no access mode has been set.
    None = 0,
    /// Least mode means all reads/writes will read/write the least significant byte.
    Least = 1,
    /// Most mode means all reads/writes will read/write the most significant byte.
    Most = 2,
    /// Both mode means first the least significant byte will be read/written, then the
    /// next read/write will read/write the most significant byte.
    Both = 3,
}
664
665 /// Convenience implementation for converting from a u8
666 impl From<u8> for PitRWMode {
from(item: u8) -> Self667 fn from(item: u8) -> Self {
668 PitRWMode::n(item).unwrap_or_else(|| {
669 error!("Invalid PitRWMode value {}, setting to 0", item);
670 PitRWMode::None
671 })
672 }
673 }
674
/// The PitRWState enum represents the state of reading to or writing from a channel.
/// This is related to the PitRWMode, it mainly gives more detail about the state of the channel
/// with respect to PitRWMode::Both.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWState {
    /// None mode means that no access mode has been set.
    None = 0,
    /// LSB means that the channel is in PitRWMode::Least access mode.
    LSB = 1,
    /// MSB means that the channel is in PitRWMode::Most access mode.
    MSB = 2,
    /// Word0 means that the channel is in PitRWMode::Both mode, and the least significant byte
    /// has not been read/written yet.
    Word0 = 3,
    /// Word1 means that the channel is in PitRWMode::Both mode and the least significant byte
    /// has already been read/written, and the next byte to be read/written will be the most
    /// significant byte.
    Word1 = 4,
}
695
696 /// Convenience implementation for converting from a u8
697 impl From<u8> for PitRWState {
from(item: u8) -> Self698 fn from(item: u8) -> Self {
699 PitRWState::n(item).unwrap_or_else(|| {
700 error!("Invalid PitRWState value {}, setting to 0", item);
701 PitRWState::None
702 })
703 }
704 }
705
/// The PitChannelState represents the state of one of the PIT's three counters.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitChannelState {
    /// The starting value for the counter.
    pub count: u32,
    /// Stores the channel count from the last time the count was latched.
    pub latched_count: u16,
    /// Indicates the PitRWState state of reading the latch value.
    pub count_latched: PitRWState,
    /// Indicates whether ReadBack status has been latched.
    pub status_latched: bool,
    /// Stores the channel status from the last time the status was latched. The status contains
    /// information about the access mode of this channel, but changing those bits in the status
    /// will not change the behavior of the pit.
    pub status: u8,
    /// Indicates the PitRWState state of reading the counter.
    pub read_state: PitRWState,
    /// Indicates the PitRWState state of writing the counter.
    pub write_state: PitRWState,
    /// Stores the value with which the counter was initialized. Counters are 16-
    /// bit values with an effective range of 1-65536 (65536 represented by 0).
    pub reload_value: u16,
    /// The command access mode of this channel.
    pub rw_mode: PitRWMode,
    /// The operation mode of this channel (the PIT's counting mode field).
    pub mode: u8,
    /// Whether or not we are in bcd mode. Not supported by KVM or crosvm's PIT implementation.
    pub bcd: bool,
    /// Value of the gate input pin. This only applies to channel 2.
    pub gate: bool,
    /// Nanosecond timestamp of when the count value was loaded.
    pub count_load_time: u64,
}
740
741 // Convenience constructors for IrqRoutes
742 impl IrqRoute {
ioapic_irq_route(irq_num: u32) -> IrqRoute743 pub fn ioapic_irq_route(irq_num: u32) -> IrqRoute {
744 IrqRoute {
745 gsi: irq_num,
746 source: IrqSource::Irqchip {
747 chip: IrqSourceChip::Ioapic,
748 pin: irq_num,
749 },
750 }
751 }
752
pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute753 pub fn pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute {
754 IrqRoute {
755 gsi: irq_num,
756 source: IrqSource::Irqchip {
757 chip: id,
758 pin: irq_num % 8,
759 },
760 }
761 }
762 }
763
/// State of a VCPU's general purpose registers: the 16 x86_64 GPRs plus
/// the instruction pointer and flags register.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Regs {
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    pub rsp: u64,
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,
    pub rip: u64,
    pub rflags: u64,
}
787
788 impl Default for Regs {
default() -> Self789 fn default() -> Self {
790 Regs {
791 rax: 0,
792 rbx: 0,
793 rcx: 0,
794 rdx: 0,
795 rsi: 0,
796 rdi: 0,
797 rsp: 0,
798 rbp: 0,
799 r8: 0,
800 r9: 0,
801 r10: 0,
802 r11: 0,
803 r12: 0,
804 r13: 0,
805 r14: 0,
806 r15: 0,
807 rip: 0xfff0, // Reset vector.
808 rflags: 0x2, // Bit 1 (0x2) is always 1.
809 }
810 }
811 }
812
/// State of a memory segment.
///
/// Field names follow the x86 segment-descriptor attribute bits (see the
/// Intel SDM Vol. 3A references in `Sregs::default` below).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Segment {
    pub base: u64,
    /// Limit of the segment - always in bytes, regardless of granularity (`g`) field.
    pub limit_bytes: u32,
    pub selector: u16,
    /// Segment type field (code/data access bits).
    pub type_: u8,
    /// Present bit.
    pub present: u8,
    /// Descriptor privilege level (0-3).
    pub dpl: u8,
    /// Default operation size / big bit (D/B).
    pub db: u8,
    /// Descriptor type bit: 0 = system segment, 1 = code/data segment.
    pub s: u8,
    /// 64-bit code segment bit (L).
    pub l: u8,
    /// Granularity bit.
    pub g: u8,
    /// Available-for-software bit (AVL).
    pub avl: u8,
}
830
/// State of a global descriptor table or interrupt descriptor table.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DescriptorTable {
    /// Linear base address of the table.
    pub base: u64,
    /// Table limit, as loaded into the descriptor-table register.
    pub limit: u16,
}
838
/// State of a VCPU's special registers: segment registers, descriptor
/// tables, and control registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Sregs {
    pub cs: Segment,
    pub ds: Segment,
    pub es: Segment,
    pub fs: Segment,
    pub gs: Segment,
    pub ss: Segment,
    /// Task register.
    pub tr: Segment,
    /// Local descriptor table register.
    pub ldt: Segment,
    pub gdt: DescriptorTable,
    pub idt: DescriptorTable,
    pub cr0: u64,
    pub cr2: u64,
    pub cr3: u64,
    pub cr4: u64,
    pub cr8: u64,
    /// Extended feature enable register.
    pub efer: u64,
}
860
impl Default for Sregs {
    /// Architectural real-mode reset state: 16-bit segments with CS pointing
    /// at the reset vector, empty descriptor tables, and CR0 with only the
    /// reserved ET bit and cache-disable set.
    fn default() -> Self {
        // Intel SDM Vol. 3A, 3.4.5.1 ("Code- and Data-Segment Descriptor Types")
        const SEG_TYPE_DATA: u8 = 0b0000;
        const SEG_TYPE_DATA_WRITABLE: u8 = 0b0010;

        const SEG_TYPE_CODE: u8 = 0b1000;
        const SEG_TYPE_CODE_READABLE: u8 = 0b0010;

        const SEG_TYPE_ACCESSED: u8 = 0b0001;

        // Intel SDM Vol. 3A, 3.4.5 ("Segment Descriptors")
        const SEG_S_SYSTEM: u8 = 0; // System segment.
        const SEG_S_CODE_OR_DATA: u8 = 1; // Data/code segment.

        // 16-bit real-mode code segment (reset vector).
        let code_seg = Segment {
            base: 0xffff0000,
            limit_bytes: 0xffff,
            selector: 0xf000,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit real-mode data segment.
        let data_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE | SEG_TYPE_ACCESSED, // 3
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit TSS segment.
        let task_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Local descriptor table.
        let ldt = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE, // 2
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Global descriptor table.
        let gdt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        // Interrupt descriptor table.
        let idt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        let cr0 = (1 << 4) // CR0.ET (reserved, always 1)
            | (1 << 30); // CR0.CD (cache disable)

        // All data segment registers share the same initial descriptor.
        Sregs {
            cs: code_seg,
            ds: data_seg,
            es: data_seg,
            fs: data_seg,
            gs: data_seg,
            ss: data_seg,
            tr: task_seg,
            ldt,
            gdt,
            idt,
            cr0,
            cr2: 0,
            cr3: 0,
            cr4: 0,
            cr8: 0,
            efer: 0,
        }
    }
}
955
/// x87 80-bit floating point value.
///
/// Stored as the two little-endian halves of the 80-bit ("TBYTE") representation.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct FpuReg {
    /// 64-bit mantissa.
    pub significand: u64,

    /// 15-bit biased exponent and sign bit.
    pub sign_exp: u16,
}
966
967 impl FpuReg {
968 /// Convert an array of 8x16-byte arrays to an array of 8 `FpuReg`.
969 ///
970 /// Ignores any data in the upper 6 bytes of each element; the values represent 80-bit FPU
971 /// registers, so the upper 48 bits are unused.
from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8]972 pub fn from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8] {
973 let mut regs = [FpuReg::default(); 8];
974 for (dst, src) in regs.iter_mut().zip(byte_arrays.iter()) {
975 let tbyte: [u8; 10] = src[0..10].try_into().unwrap();
976 *dst = FpuReg::from(tbyte);
977 }
978 regs
979 }
980
981 /// Convert an array of 8 `FpuReg` into 8x16-byte arrays.
to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8]982 pub fn to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8] {
983 let mut byte_arrays = [[0u8; 16]; 8];
984 for (dst, src) in byte_arrays.iter_mut().zip(regs.iter()) {
985 *dst = (*src).into();
986 }
987 byte_arrays
988 }
989 }
990
991 impl From<[u8; 10]> for FpuReg {
992 /// Construct a `FpuReg` from an 80-bit representation.
from(value: [u8; 10]) -> FpuReg993 fn from(value: [u8; 10]) -> FpuReg {
994 // These array sub-slices can't fail, but there's no (safe) way to express that in Rust
995 // without an `unwrap()`.
996 let significand_bytes = value[0..8].try_into().unwrap();
997 let significand = u64::from_le_bytes(significand_bytes);
998 let sign_exp_bytes = value[8..10].try_into().unwrap();
999 let sign_exp = u16::from_le_bytes(sign_exp_bytes);
1000 FpuReg {
1001 significand,
1002 sign_exp,
1003 }
1004 }
1005 }
1006
1007 impl From<FpuReg> for [u8; 10] {
1008 /// Convert an `FpuReg` into its 80-bit "TBYTE" representation.
from(value: FpuReg) -> [u8; 10]1009 fn from(value: FpuReg) -> [u8; 10] {
1010 let mut bytes = [0u8; 10];
1011 bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1012 bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1013 bytes
1014 }
1015 }
1016
1017 impl From<FpuReg> for [u8; 16] {
1018 /// Convert an `FpuReg` into its 80-bit representation plus 6 unused upper bytes.
1019 /// This is a convenience function for converting to hypervisor types.
from(value: FpuReg) -> [u8; 16]1020 fn from(value: FpuReg) -> [u8; 16] {
1021 let mut bytes = [0u8; 16];
1022 bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1023 bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1024 bytes
1025 }
1026 }
1027
/// State of a VCPU's floating point unit.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Fpu {
    /// The 8 x87 FPU data registers.
    pub fpr: [FpuReg; 8],
    /// x87 FPU control word.
    pub fcw: u16,
    /// x87 FPU status word.
    pub fsw: u16,
    /// FPU tag word — abridged 8-bit form, given the width. TODO(review): confirm against the
    /// hypervisor ABI this is exchanged with.
    pub ftwx: u8,
    /// Last executed FPU opcode.
    pub last_opcode: u16,
    /// Last FPU instruction pointer.
    pub last_ip: u64,
    /// Last FPU data (operand) pointer.
    pub last_dp: u64,
    /// The 16 XMM registers.
    pub xmm: [[u8; 16usize]; 16usize],
    /// SSE control/status register.
    pub mxcsr: u32,
}
1042
1043 impl Default for Fpu {
default() -> Self1044 fn default() -> Self {
1045 Fpu {
1046 fpr: Default::default(),
1047 fcw: 0x37f, // Intel SDM Vol. 1, 13.6
1048 fsw: 0,
1049 ftwx: 0,
1050 last_opcode: 0,
1051 last_ip: 0,
1052 last_dp: 0,
1053 xmm: Default::default(),
1054 mxcsr: 0x1f80, // Intel SDM Vol. 1, 11.6.4
1055 }
1056 }
1057 }
1058
/// State of a VCPU's debug registers.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DebugRegs {
    /// Debug address registers DR0-DR3.
    pub db: [u64; 4usize],
    /// Debug status register.
    pub dr6: u64,
    /// Debug control register.
    pub dr7: u64,
}
1067
/// The core type of an Intel hybrid-architecture CPU.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum CpuHybridType {
    /// Intel Atom (efficiency core).
    Atom,
    /// Intel Core (performance core).
    Core,
}
1076
/// State of the VCPU's x87 FPU, MMX, XMM, YMM registers.
/// May contain more state depending on enabled extensions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Xsave {
    // Raw buffer contents, stored as u32 words; sized by rounding the requested byte length up
    // to a whole number of words (see `Xsave::new`).
    data: Vec<u32>,

    // Actual length in bytes. May be smaller than data if a non-u32 multiple of bytes is
    // requested.
    len: usize,
}
1087
1088 impl Xsave {
1089 /// Create a new buffer to store Xsave data.
1090 ///
1091 /// # Argments
1092 /// * `len` size in bytes.
new(len: usize) -> Self1093 pub fn new(len: usize) -> Self {
1094 Xsave {
1095 data: vec![0; (len + 3) / 4],
1096 len,
1097 }
1098 }
1099
as_ptr(&self) -> *const c_void1100 pub fn as_ptr(&self) -> *const c_void {
1101 self.data.as_ptr() as *const c_void
1102 }
1103
as_mut_ptr(&mut self) -> *mut c_void1104 pub fn as_mut_ptr(&mut self) -> *mut c_void {
1105 self.data.as_mut_ptr() as *mut c_void
1106 }
1107
1108 /// Length in bytes of the XSAVE data.
len(&self) -> usize1109 pub fn len(&self) -> usize {
1110 self.len
1111 }
1112
1113 /// Returns true is length of XSAVE data is zero
is_empty(&self) -> bool1114 pub fn is_empty(&self) -> bool {
1115 self.len() == 0
1116 }
1117 }
1118