• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! x86 architecture support.
6 
7 #![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
8 
9 mod fdt;
10 
// `setup_data` entry type values. They are used as the discriminants of `SetupDataType` and end
// up in the `type_` field of each `setup_data_hdr` written to guest memory.
const SETUP_DTB: u32 = 2;
const SETUP_RNG_SEED: u32 = 9;
13 
14 #[allow(dead_code)]
15 #[allow(non_upper_case_globals)]
16 #[allow(non_camel_case_types)]
17 #[allow(non_snake_case)]
18 pub mod bootparam;
19 
20 #[allow(dead_code)]
21 #[allow(non_upper_case_globals)]
22 mod msr_index;
23 
24 #[allow(dead_code)]
25 #[allow(non_upper_case_globals)]
26 #[allow(non_camel_case_types)]
27 #[allow(clippy::all)]
28 mod mpspec;
29 
30 #[cfg(unix)]
31 pub mod msr;
32 
33 pub mod acpi;
34 mod bzimage;
35 pub mod cpuid;
36 mod gdt;
37 pub mod interrupts;
38 pub mod mptable;
39 pub mod regs;
40 pub mod smbios;
41 
42 use std::arch::x86_64::CpuidResult;
43 use std::collections::BTreeMap;
44 use std::ffi::CStr;
45 use std::ffi::CString;
46 use std::fs::File;
47 use std::io;
48 use std::io::Seek;
49 use std::mem;
50 use std::path::PathBuf;
51 use std::sync::mpsc;
52 use std::sync::Arc;
53 
54 use acpi_tables::aml;
55 use acpi_tables::aml::Aml;
56 use acpi_tables::sdt::SDT;
57 use anyhow::Context;
58 use arch::get_serial_cmdline;
59 use arch::GetSerialCmdlineError;
60 use arch::MsrAction;
61 use arch::MsrConfig;
62 use arch::MsrFilter;
63 use arch::MsrRWType;
64 use arch::MsrValueFrom;
65 use arch::RunnableLinuxVm;
66 use arch::VmComponents;
67 use arch::VmImage;
68 #[cfg(feature = "seccomp_trace")]
69 use base::debug;
70 use base::warn;
71 #[cfg(unix)]
72 use base::AsRawDescriptors;
73 use base::Event;
74 use base::SendTube;
75 use base::Tube;
76 use base::TubeError;
77 use chrono::Utc;
78 pub use cpuid::adjust_cpuid;
79 pub use cpuid::CpuIdContext;
80 use devices::BusDevice;
81 use devices::BusDeviceObj;
82 use devices::BusResumeDevice;
83 use devices::Debugcon;
84 use devices::IrqChip;
85 use devices::IrqChipX86_64;
86 use devices::IrqEventSource;
87 use devices::PciAddress;
88 use devices::PciConfigIo;
89 use devices::PciConfigMmio;
90 use devices::PciDevice;
91 use devices::PciRoot;
92 use devices::PciRootCommand;
93 use devices::PciVirtualConfigMmio;
94 use devices::Pflash;
95 #[cfg(unix)]
96 use devices::ProxyDevice;
97 use devices::Serial;
98 use devices::SerialHardware;
99 use devices::SerialParameters;
100 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
101 use gdbstub_arch::x86::reg::id::X86_64CoreRegId;
102 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
103 use gdbstub_arch::x86::reg::X86SegmentRegs;
104 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
105 use gdbstub_arch::x86::reg::X86_64CoreRegs;
106 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
107 use gdbstub_arch::x86::reg::X87FpuInternalRegs;
108 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
109 use hypervisor::x86_64::Regs;
110 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
111 use hypervisor::x86_64::Sregs;
112 use hypervisor::CpuConfigX86_64;
113 use hypervisor::Hypervisor;
114 use hypervisor::HypervisorX86_64;
115 use hypervisor::ProtectionType;
116 use hypervisor::VcpuInitX86_64;
117 use hypervisor::VcpuX86_64;
118 use hypervisor::Vm;
119 use hypervisor::VmCap;
120 use hypervisor::VmX86_64;
121 #[cfg(feature = "seccomp_trace")]
122 use jail::read_jail_addr;
123 #[cfg(windows)]
124 use jail::FakeMinijailStub as Minijail;
125 #[cfg(unix)]
126 use minijail::Minijail;
127 use once_cell::sync::OnceCell;
128 use rand::rngs::OsRng;
129 use rand::RngCore;
130 use remain::sorted;
131 use resources::AddressRange;
132 use resources::SystemAllocator;
133 use resources::SystemAllocatorConfig;
134 use sync::Mutex;
135 use thiserror::Error;
136 use vm_control::BatControl;
137 use vm_control::BatteryType;
138 use vm_memory::GuestAddress;
139 use vm_memory::GuestMemory;
140 use vm_memory::GuestMemoryError;
141 use vm_memory::MemoryRegionOptions;
142 use zerocopy::AsBytes;
143 use zerocopy::FromBytes;
144 
145 use crate::bootparam::boot_params;
146 use crate::cpuid::EDX_HYBRID_CPU_SHIFT;
147 use crate::msr_index::*;
148 
149 #[sorted]
150 #[derive(Error, Debug)]
151 pub enum Error {
152     #[error("error allocating a single gpe")]
153     AllocateGpe,
154     #[error("error allocating IO resource: {0}")]
155     AllocateIOResouce(resources::Error),
156     #[error("error allocating a single irq")]
157     AllocateIrq,
158     #[error("unable to clone an Event: {0}")]
159     CloneEvent(base::Error),
160     #[error("failed to clone IRQ chip: {0}")]
161     CloneIrqChip(base::Error),
162     #[cfg(unix)]
163     #[error("failed to clone jail: {0}")]
164     CloneJail(minijail::Error),
165     #[error("unable to clone a Tube: {0}")]
166     CloneTube(TubeError),
167     #[error("the given kernel command line was invalid: {0}")]
168     Cmdline(kernel_cmdline::Error),
169     #[error("failed to configure hotplugged pci device: {0}")]
170     ConfigurePciDevice(arch::DeviceRegistrationError),
171     #[error("failed to configure segment registers: {0}")]
172     ConfigureSegments(regs::Error),
173     #[error("error configuring the system")]
174     ConfigureSystem,
175     #[error("unable to create ACPI tables")]
176     CreateAcpi,
177     #[error("unable to create battery devices: {0}")]
178     CreateBatDevices(arch::DeviceRegistrationError),
179     #[error("could not create debugcon device: {0}")]
180     CreateDebugconDevice(devices::SerialError),
181     #[error("unable to make an Event: {0}")]
182     CreateEvent(base::Error),
183     #[error("failed to create fdt: {0}")]
184     CreateFdt(cros_fdt::Error),
185     #[error("failed to create IOAPIC device: {0}")]
186     CreateIoapicDevice(base::Error),
187     #[error("failed to create a PCI root hub: {0}")]
188     CreatePciRoot(arch::DeviceRegistrationError),
189     #[error("unable to create PIT: {0}")]
190     CreatePit(base::Error),
191     #[error("unable to make PIT device: {0}")]
192     CreatePitDevice(devices::PitError),
193     #[cfg(unix)]
194     #[error("unable to create proxy device: {0}")]
195     CreateProxyDevice(devices::ProxyError),
196     #[error("unable to create serial devices: {0}")]
197     CreateSerialDevices(arch::DeviceRegistrationError),
198     #[error("failed to create socket: {0}")]
199     CreateSocket(io::Error),
200     #[error("failed to create VCPU: {0}")]
201     CreateVcpu(base::Error),
202     #[error("failed to create Virtio MMIO bus: {0}")]
203     CreateVirtioMmioBus(arch::DeviceRegistrationError),
204     #[error("invalid e820 setup params")]
205     E820Configuration,
206     #[cfg(feature = "direct")]
207     #[error("failed to enable ACPI event forwarding: {0}")]
208     EnableAcpiEvent(devices::DirectIrqError),
209     #[error("failed to enable singlestep execution: {0}")]
210     EnableSinglestep(base::Error),
211     #[error("failed to enable split irqchip: {0}")]
212     EnableSplitIrqchip(base::Error),
213     #[error("failed to get serial cmdline: {0}")]
214     GetSerialCmdline(GetSerialCmdlineError),
215     #[error("failed to insert device onto bus: {0}")]
216     InsertBus(devices::BusError),
217     #[error("the kernel extends past the end of RAM")]
218     InvalidCpuConfig,
219     #[error("invalid CPU config parameters")]
220     KernelOffsetPastEnd,
221     #[error("error loading bios: {0}")]
222     LoadBios(io::Error),
223     #[error("error loading kernel bzImage: {0}")]
224     LoadBzImage(bzimage::Error),
225     #[error("error loading command line: {0}")]
226     LoadCmdline(kernel_loader::Error),
227     #[error("error loading initrd: {0}")]
228     LoadInitrd(arch::LoadImageError),
229     #[error("error loading Kernel: {0}")]
230     LoadKernel(kernel_loader::Error),
231     #[error("error loading pflash: {0}")]
232     LoadPflash(io::Error),
233     #[error("error translating address: Page not present")]
234     PageNotPresent,
235     #[error("error reading guest memory {0}")]
236     ReadingGuestMemory(vm_memory::GuestMemoryError),
237     #[error("single register read not supported on x86_64")]
238     ReadRegIsUnsupported,
239     #[error("error reading CPU registers {0}")]
240     ReadRegs(base::Error),
241     #[error("error registering an IrqFd: {0}")]
242     RegisterIrqfd(base::Error),
243     #[error("error registering virtual socket device: {0}")]
244     RegisterVsock(arch::DeviceRegistrationError),
245     #[error("error reserved pcie config mmio")]
246     ReservePcieCfgMmio(resources::Error),
247     #[error("failed to set a hardware breakpoint: {0}")]
248     SetHwBreakpoint(base::Error),
249     #[error("failed to set interrupts: {0}")]
250     SetLint(interrupts::Error),
251     #[error("failed to set tss addr: {0}")]
252     SetTssAddr(base::Error),
253     #[error("failed to set up cmos: {0}")]
254     SetupCmos(anyhow::Error),
255     #[error("failed to set up cpuid: {0}")]
256     SetupCpuid(cpuid::Error),
257     #[error("setup data too large")]
258     SetupDataTooLarge,
259     #[error("failed to set up FPU: {0}")]
260     SetupFpu(base::Error),
261     #[error("failed to set up guest memory: {0}")]
262     SetupGuestMemory(GuestMemoryError),
263     #[error("failed to set up mptable: {0}")]
264     SetupMptable(mptable::Error),
265     #[error("failed to set up MSRs: {0}")]
266     SetupMsrs(base::Error),
267     #[error("failed to set up page tables: {0}")]
268     SetupPageTables(regs::Error),
269     #[error("failed to set up pflash: {0}")]
270     SetupPflash(anyhow::Error),
271     #[error("failed to set up registers: {0}")]
272     SetupRegs(regs::Error),
273     #[error("failed to set up SMBIOS: {0}")]
274     SetupSmbios(smbios::Error),
275     #[error("failed to set up sregs: {0}")]
276     SetupSregs(base::Error),
277     #[error("failed to translate virtual address")]
278     TranslatingVirtAddr,
279     #[error("protected VMs not supported on x86_64")]
280     UnsupportedProtectionType,
281     #[error("single register write not supported on x86_64")]
282     WriteRegIsUnsupported,
283     #[error("error writing CPU registers {0}")]
284     WriteRegs(base::Error),
285     #[error("error writing guest memory {0}")]
286     WritingGuestMemory(GuestMemoryError),
287     #[error("error writing setup_data: {0}")]
288     WritingSetupData(GuestMemoryError),
289     #[error("the zero page extends past the end of guest_mem")]
290     ZeroPagePastRamEnd,
291     #[error("error writing the zero page of guest memory")]
292     ZeroPageSetup,
293 }
294 
/// Result type used throughout this crate, with this crate's [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
296 
/// Marker type on which the x86/x86_64 implementation of `arch::LinuxArch` is defined.
pub struct X8664arch;
298 
// Like `bootparam::setup_data` without the incomplete array field at the end, which allows us to
// safely implement Copy, Clone.
//
// One header is written to guest memory in front of every `SetupData` payload.
#[repr(C)]
#[derive(Copy, Clone, Default, FromBytes, AsBytes)]
struct setup_data_hdr {
    // Guest address of the next entry in the list; 0 terminates the list.
    pub next: u64,
    // Entry type, taken from a `SetupDataType` value.
    pub type_: u32,
    // Length in bytes of the payload that follows this header.
    pub len: u32,
}
308 
/// Kind of payload carried by a [`SetupData`] entry.
///
/// The discriminant is the raw value stored in the `type_` field of the entry header.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SetupDataType {
    Dtb = SETUP_DTB,
    RngSeed = SETUP_RNG_SEED,
}
315 
/// A single entry to be inserted in the bootparam `setup_data` linked list.
pub struct SetupData {
    /// Raw payload bytes, written to guest memory immediately after the entry header.
    pub data: Vec<u8>,
    /// Type tag recorded in the entry header.
    pub type_: SetupDataType,
}
321 
/// Memory type recorded in the `type_` field of an e820 table entry.
enum E820Type {
    /// Usable RAM.
    Ram = 1,
    /// Reserved range.
    Reserved = 2,
}
326 
// Size units.
const MB: u64 = 1024 * 1024;
const GB: u64 = 1024 * MB;

// Guest physical memory layout.
pub const BOOT_STACK_POINTER: u64 = 0x8000;
const START_OF_RAM_32BITS: u64 = if cfg!(feature = "direct") { 0x1000 } else { 0 };
const FIRST_ADDR_PAST_32BITS: u64 = 4 * GB;
// Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
const HIGH_MMIO_MAX_END: u64 = (1u64 << 46) - 1;
pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
pub const ZERO_PAGE_OFFSET: u64 = 0x7000;
const TSS_ADDR: u64 = 0xfffb_d000;
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;

// Kernel image, command line and setup_data placement. The setup_data area starts right after
// the command line buffer and ends where the ACPI RSDP window begins.
pub const KERNEL_START_OFFSET: u64 = 0x20_0000;
const CMDLINE_OFFSET: u64 = 0x2_0000;
const CMDLINE_MAX_SIZE: u64 = 0x800; // including terminating zero
const SETUP_DATA_START: u64 = CMDLINE_OFFSET + CMDLINE_MAX_SIZE;
const SETUP_DATA_END: u64 = ACPI_HI_RSDP_WINDOW_BASE;

// Legacy IRQ assignments.
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// X86_64_SCI_IRQ is used to fill the ACPI FACP table. The SCI IRQ should be a legacy IRQ number
// below 16 (most platforms use the fixed IRQ number 9), so IRQ 5 is reserved for SCI and other
// devices are allocated starting from X86_64_IRQ_BASE.
pub const X86_64_SCI_IRQ: u32 = 5;
// The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
pub const X86_64_IRQ_BASE: u32 = 9;
356 
/// CPU vendor, as returned by [`get_cpu_manufacturer`].
#[derive(Debug, PartialEq, Eq)]
pub enum CpuManufacturer {
    Intel,
    Amd,
    Unknown,
}
363 
/// Returns the CPU manufacturer reported by `cpuid::cpu_manufacturer`.
pub fn get_cpu_manufacturer() -> CpuManufacturer {
    cpuid::cpu_manufacturer()
}
367 
// Memory layout below 4G. A single instance is stored in `LOW_MEMORY_LAYOUT`.
struct LowMemoryLayout {
    // the pci mmio range below 4G
    pci_mmio: AddressRange,
    // the pcie cfg mmio range
    pcie_cfg_mmio: AddressRange,
}
375 
// Set by `init_low_memory_layout`; read by `read_pci_mmio_before_32bit` and `read_pcie_cfg_mmio`.
static LOW_MEMORY_LAYOUT: OnceCell<LowMemoryLayout> = OnceCell::new();
377 
init_low_memory_layout(pcie_ecam: Option<AddressRange>, pci_low_start: Option<u64>)378 pub fn init_low_memory_layout(pcie_ecam: Option<AddressRange>, pci_low_start: Option<u64>) {
379     LOW_MEMORY_LAYOUT.get_or_init(|| {
380         // Make sure it align to 256MB for MTRR convenient
381         const MEM_32BIT_GAP_SIZE: u64 = if cfg!(feature = "direct") {
382             // Allow space for identity mapping coreboot memory regions on the host
383             // which is found at around 7a00_0000 (little bit before 2GB)
384             //
385             // TODO(b/188011323): stop hardcoding sizes and addresses here and instead
386             // determine the memory map from how the VM has been configured via the
387             // command line.
388             2560 * MB
389         } else {
390             768 * MB
391         };
392         // Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/.....
393         const RESERVED_MEM_SIZE: u64 = 0x800_0000;
394         const PCI_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
395         // Reserve 64MB for pcie enhanced configuration
396         const DEFAULT_PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000;
397         const DEFAULT_PCIE_CFG_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
398         const DEFAULT_PCIE_CFG_MMIO_START: u64 =
399             DEFAULT_PCIE_CFG_MMIO_END - DEFAULT_PCIE_CFG_MMIO_SIZE + 1;
400         const DEFAULT_PCIE_CFG_MMIO: AddressRange = AddressRange {
401             start: DEFAULT_PCIE_CFG_MMIO_START,
402             end: DEFAULT_PCIE_CFG_MMIO_END,
403         };
404 
405         let pcie_cfg_mmio = pcie_ecam.unwrap_or(DEFAULT_PCIE_CFG_MMIO);
406 
407         let pci_mmio = if let Some(pci_low) = pci_low_start {
408             AddressRange {
409                 start: pci_low,
410                 end: PCI_MMIO_END,
411             }
412         } else {
413             AddressRange {
414                 start: pcie_cfg_mmio
415                     .start
416                     .min(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE),
417                 end: PCI_MMIO_END,
418             }
419         };
420 
421         LowMemoryLayout {
422             pci_mmio,
423             pcie_cfg_mmio,
424         }
425     });
426 }
427 
read_pci_mmio_before_32bit() -> AddressRange428 pub fn read_pci_mmio_before_32bit() -> AddressRange {
429     LOW_MEMORY_LAYOUT.get().unwrap().pci_mmio
430 }
read_pcie_cfg_mmio() -> AddressRange431 pub fn read_pcie_cfg_mmio() -> AddressRange {
432     LOW_MEMORY_LAYOUT.get().unwrap().pcie_cfg_mmio
433 }
434 
435 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
436 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
437 /// pointer at the effective physical address 0xFFFF_FFF0.
bios_start(bios_size: u64) -> GuestAddress438 fn bios_start(bios_size: u64) -> GuestAddress {
439     GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
440 }
441 
configure_system( guest_mem: &GuestMemory, kernel_addr: GuestAddress, cmdline_addr: GuestAddress, cmdline_size: usize, setup_data: Option<GuestAddress>, initrd: Option<(GuestAddress, usize)>, mut params: boot_params, ) -> Result<()>442 fn configure_system(
443     guest_mem: &GuestMemory,
444     kernel_addr: GuestAddress,
445     cmdline_addr: GuestAddress,
446     cmdline_size: usize,
447     setup_data: Option<GuestAddress>,
448     initrd: Option<(GuestAddress, usize)>,
449     mut params: boot_params,
450 ) -> Result<()> {
451     const EBDA_START: u64 = 0x0009_fc00;
452     const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
453     const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
454     const KERNEL_LOADER_OTHER: u8 = 0xff;
455     const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.
456 
457     params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
458     params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
459     params.hdr.header = KERNEL_HDR_MAGIC;
460     params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
461     params.ext_cmd_line_ptr = (cmdline_addr.offset() >> 32) as u32;
462     params.hdr.cmdline_size = cmdline_size as u32;
463     params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
464     if let Some(setup_data) = setup_data {
465         params.hdr.setup_data = setup_data.offset();
466     }
467     if let Some((initrd_addr, initrd_size)) = initrd {
468         params.hdr.ramdisk_image = initrd_addr.offset() as u32;
469         params.hdr.ramdisk_size = initrd_size as u32;
470     }
471 
472     add_e820_entry(
473         &mut params,
474         AddressRange {
475             start: START_OF_RAM_32BITS,
476             end: EBDA_START - 1,
477         },
478         E820Type::Ram,
479     )?;
480 
481     // GuestMemory::end_addr() returns the first address past the end, so subtract 1 to get the
482     // inclusive end.
483     let guest_mem_end = guest_mem.end_addr().offset() - 1;
484     let ram_below_4g = AddressRange {
485         start: kernel_addr.offset(),
486         end: guest_mem_end.min(read_pci_mmio_before_32bit().start - 1),
487     };
488     let ram_above_4g = AddressRange {
489         start: FIRST_ADDR_PAST_32BITS,
490         end: guest_mem_end,
491     };
492     add_e820_entry(&mut params, ram_below_4g, E820Type::Ram)?;
493     if !ram_above_4g.is_empty() {
494         add_e820_entry(&mut params, ram_above_4g, E820Type::Ram)?
495     }
496 
497     let pcie_cfg_mmio_range = read_pcie_cfg_mmio();
498     add_e820_entry(&mut params, pcie_cfg_mmio_range, E820Type::Reserved)?;
499 
500     add_e820_entry(
501         &mut params,
502         X8664arch::get_pcie_vcfg_mmio_range(guest_mem, &pcie_cfg_mmio_range),
503         E820Type::Reserved,
504     )?;
505 
506     let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
507     if !guest_mem.is_valid_range(zero_page_addr, mem::size_of::<boot_params>() as u64) {
508         return Err(Error::ZeroPagePastRamEnd);
509     }
510 
511     guest_mem
512         .write_obj_at_addr(params, zero_page_addr)
513         .map_err(|_| Error::ZeroPageSetup)?;
514 
515     Ok(())
516 }
517 
518 /// Write setup_data entries in guest memory and link them together with the `next` field.
519 ///
520 /// Returns the guest address of the first entry in the setup_data list, if any.
write_setup_data( guest_mem: &GuestMemory, setup_data_start: GuestAddress, setup_data_end: GuestAddress, setup_data: &[SetupData], ) -> Result<Option<GuestAddress>>521 fn write_setup_data(
522     guest_mem: &GuestMemory,
523     setup_data_start: GuestAddress,
524     setup_data_end: GuestAddress,
525     setup_data: &[SetupData],
526 ) -> Result<Option<GuestAddress>> {
527     let mut setup_data_list_head = None;
528 
529     // Place the first setup_data at the first 64-bit aligned offset following setup_data_start.
530     let mut setup_data_addr = setup_data_start.align(8).ok_or(Error::SetupDataTooLarge)?;
531 
532     let mut entry_iter = setup_data.iter().peekable();
533     while let Some(entry) = entry_iter.next() {
534         if setup_data_list_head.is_none() {
535             setup_data_list_head = Some(setup_data_addr);
536         }
537 
538         // Ensure the entry (header plus data) fits into guest memory.
539         let entry_size = (mem::size_of::<setup_data_hdr>() + entry.data.len()) as u64;
540         let entry_end = setup_data_addr
541             .checked_add(entry_size)
542             .ok_or(Error::SetupDataTooLarge)?;
543 
544         if entry_end >= setup_data_end {
545             return Err(Error::SetupDataTooLarge);
546         }
547 
548         let next_setup_data_addr = if entry_iter.peek().is_some() {
549             // Place the next setup_data at a 64-bit aligned address.
550             setup_data_addr
551                 .checked_add(entry_size)
552                 .and_then(|addr| addr.align(8))
553                 .ok_or(Error::SetupDataTooLarge)?
554         } else {
555             // This is the final entry. Terminate the list with next == 0.
556             GuestAddress(0)
557         };
558 
559         let hdr = setup_data_hdr {
560             next: next_setup_data_addr.offset(),
561             type_: entry.type_ as u32,
562             len: entry
563                 .data
564                 .len()
565                 .try_into()
566                 .map_err(|_| Error::SetupDataTooLarge)?,
567         };
568 
569         guest_mem
570             .write_obj_at_addr(hdr, setup_data_addr)
571             .map_err(Error::WritingSetupData)?;
572         guest_mem
573             .write_all_at_addr(
574                 &entry.data,
575                 setup_data_addr.unchecked_add(mem::size_of::<setup_data_hdr>() as u64),
576             )
577             .map_err(Error::WritingSetupData)?;
578 
579         setup_data_addr = next_setup_data_addr;
580     }
581 
582     Ok(setup_data_list_head)
583 }
584 
585 /// Generate a SETUP_RNG_SEED SetupData with random seed data.
setup_data_rng_seed() -> SetupData586 fn setup_data_rng_seed() -> SetupData {
587     let mut data = vec![0u8; 256];
588     OsRng.fill_bytes(&mut data);
589     SetupData {
590         data,
591         type_: SetupDataType::RngSeed,
592     }
593 }
594 
595 /// Add an e820 region to the e820 map.
596 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
add_e820_entry(params: &mut boot_params, range: AddressRange, mem_type: E820Type) -> Result<()>597 fn add_e820_entry(params: &mut boot_params, range: AddressRange, mem_type: E820Type) -> Result<()> {
598     if params.e820_entries >= params.e820_table.len() as u8 {
599         return Err(Error::E820Configuration);
600     }
601 
602     let size = range.len().ok_or(Error::E820Configuration)?;
603 
604     params.e820_table[params.e820_entries as usize].addr = range.start;
605     params.e820_table[params.e820_entries as usize].size = size;
606     params.e820_table[params.e820_entries as usize].type_ = mem_type as u32;
607     params.e820_entries += 1;
608 
609     Ok(())
610 }
611 
612 /// Returns a Vec of the valid memory addresses.
613 /// These should be used to configure the GuestMemory structure for the platform.
614 /// For x86_64 all addresses are valid from the start of the kernel except a
615 /// carve out at the end of 32bit address space.
arch_memory_regions( size: u64, bios_size: Option<u64>, ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>616 pub fn arch_memory_regions(
617     size: u64,
618     bios_size: Option<u64>,
619 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
620     let mem_start = START_OF_RAM_32BITS;
621     let mem_end = GuestAddress(size + mem_start);
622 
623     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
624     let end_32bit_gap_start = GuestAddress(read_pci_mmio_before_32bit().start);
625 
626     let mut regions = Vec::new();
627     if mem_end <= end_32bit_gap_start {
628         regions.push((GuestAddress(mem_start), size, Default::default()));
629         if let Some(bios_size) = bios_size {
630             regions.push((bios_start(bios_size), bios_size, Default::default()));
631         }
632     } else {
633         regions.push((
634             GuestAddress(mem_start),
635             end_32bit_gap_start.offset() - mem_start,
636             Default::default(),
637         ));
638         if let Some(bios_size) = bios_size {
639             regions.push((bios_start(bios_size), bios_size, Default::default()));
640         }
641         regions.push((
642             first_addr_past_32bits,
643             mem_end.offset_from(end_32bit_gap_start),
644             Default::default(),
645         ));
646     }
647 
648     regions
649 }
650 
651 impl arch::LinuxArch for X8664arch {
652     type Error = Error;
653 
guest_memory_layout( components: &VmComponents, _hypervisor: &impl Hypervisor, ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>654     fn guest_memory_layout(
655         components: &VmComponents,
656         _hypervisor: &impl Hypervisor,
657     ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error> {
658         init_low_memory_layout(components.pcie_ecam, components.pci_low_start);
659 
660         let bios_size = match &components.vm_image {
661             VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
662             VmImage::Kernel(_) => None,
663         };
664 
665         Ok(arch_memory_regions(components.memory_size, bios_size))
666     }
667 
get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig668     fn get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig {
669         SystemAllocatorConfig {
670             io: Some(AddressRange {
671                 start: 0xc000,
672                 end: 0xffff,
673             }),
674             low_mmio: read_pci_mmio_before_32bit(),
675             high_mmio: Self::get_high_mmio_range(vm),
676             platform_mmio: None,
677             first_irq: X86_64_IRQ_BASE,
678         }
679     }
680 
build_vm<V, Vcpu>( mut components: VmComponents, vm_evt_wrtube: &SendTube, system_allocator: &mut SystemAllocator, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, battery: (Option<BatteryType>, Option<Minijail>), mut vm: V, ramoops_region: Option<arch::pstore::RamoopsRegion>, devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, irq_chip: &mut dyn IrqChipX86_64, vcpu_ids: &mut Vec<usize>, dump_device_tree_blob: Option<PathBuf>, debugcon_jail: Option<Minijail>, pflash_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>, ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error> where V: VmX86_64, Vcpu: VcpuX86_64,681     fn build_vm<V, Vcpu>(
682         mut components: VmComponents,
683         vm_evt_wrtube: &SendTube,
684         system_allocator: &mut SystemAllocator,
685         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
686         serial_jail: Option<Minijail>,
687         battery: (Option<BatteryType>, Option<Minijail>),
688         mut vm: V,
689         ramoops_region: Option<arch::pstore::RamoopsRegion>,
690         devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
691         irq_chip: &mut dyn IrqChipX86_64,
692         vcpu_ids: &mut Vec<usize>,
693         dump_device_tree_blob: Option<PathBuf>,
694         debugcon_jail: Option<Minijail>,
695         pflash_jail: Option<Minijail>,
696         #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>,
697     ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
698     where
699         V: VmX86_64,
700         Vcpu: VcpuX86_64,
701     {
702         if components.hv_cfg.protection_type != ProtectionType::Unprotected {
703             return Err(Error::UnsupportedProtectionType);
704         }
705 
706         let mem = vm.get_memory().clone();
707 
708         let vcpu_count = components.vcpu_count;
709 
710         let tss_addr = GuestAddress(TSS_ADDR);
711         vm.set_tss_addr(tss_addr).map_err(Error::SetTssAddr)?;
712 
713         // Use IRQ info in ACPI if provided by the user.
714         let mut noirq = true;
715         let mut mptable = true;
716         let mut sci_irq = X86_64_SCI_IRQ;
717 
718         // punch pcie config mmio from pci low mmio, so that it couldn't be
719         // allocated to any device.
720         let pcie_cfg_mmio_range = read_pcie_cfg_mmio();
721         system_allocator
722             .reserve_mmio(pcie_cfg_mmio_range)
723             .map_err(Error::ReservePcieCfgMmio)?;
724 
725         for sdt in components.acpi_sdts.iter() {
726             if sdt.is_signature(b"DSDT") || sdt.is_signature(b"APIC") {
727                 noirq = false;
728             } else if sdt.is_signature(b"FACP") {
729                 mptable = false;
730                 let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
731                 sci_irq = sci_irq_fadt.into();
732                 if !system_allocator.reserve_irq(sci_irq) {
733                     warn!("sci irq {} already reserved.", sci_irq);
734                 }
735             }
736         }
737 
738         let pcie_vcfg_range = Self::get_pcie_vcfg_mmio_range(&mem, &pcie_cfg_mmio_range);
739         let mmio_bus = Arc::new(devices::Bus::new());
740         let io_bus = Arc::new(devices::Bus::new());
741 
742         let (pci_devices, devs): (Vec<_>, Vec<_>) = devs
743             .into_iter()
744             .partition(|(dev, _)| dev.as_pci_device().is_some());
745 
746         let pci_devices = pci_devices
747             .into_iter()
748             .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
749             .collect();
750 
751         let (pci, pci_irqs, mut pid_debug_label_map, amls) = arch::generate_pci_root(
752             pci_devices,
753             irq_chip.as_irq_chip_mut(),
754             mmio_bus.clone(),
755             io_bus.clone(),
756             system_allocator,
757             &mut vm,
758             4, // Share the four pin interrupts (INTx#)
759             Some(pcie_vcfg_range.start),
760             #[cfg(feature = "swap")]
761             swap_controller,
762         )
763         .map_err(Error::CreatePciRoot)?;
764 
765         let pci = Arc::new(Mutex::new(pci));
766         pci.lock().enable_pcie_cfg_mmio(pcie_cfg_mmio_range.start);
767         let pci_cfg = PciConfigIo::new(
768             pci.clone(),
769             vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
770         );
771         let pci_bus = Arc::new(Mutex::new(pci_cfg));
772         io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();
773 
774         let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
775         let pcie_cfg_mmio_len = pcie_cfg_mmio_range.len().unwrap();
776         mmio_bus
777             .insert(pcie_cfg_mmio, pcie_cfg_mmio_range.start, pcie_cfg_mmio_len)
778             .unwrap();
779 
780         let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 13)));
781         mmio_bus
782             .insert(
783                 pcie_vcfg_mmio,
784                 pcie_vcfg_range.start,
785                 pcie_vcfg_range.len().unwrap(),
786             )
787             .unwrap();
788 
789         let (virtio_mmio_devices, _others): (Vec<_>, Vec<_>) = devs
790             .into_iter()
791             .partition(|(dev, _)| dev.as_virtio_mmio_device().is_some());
792 
793         let virtio_mmio_devices = virtio_mmio_devices
794             .into_iter()
795             .map(|(dev, jail_orig)| (*(dev.into_virtio_mmio_device().unwrap()), jail_orig))
796             .collect();
797         let (mut virtio_mmio_pid, sdts) = arch::generate_virtio_mmio_bus(
798             virtio_mmio_devices,
799             irq_chip.as_irq_chip_mut(),
800             &mmio_bus,
801             system_allocator,
802             &mut vm,
803             components.acpi_sdts,
804             #[cfg(feature = "swap")]
805             swap_controller,
806         )
807         .map_err(Error::CreateVirtioMmioBus)?;
808         components.acpi_sdts = sdts;
809         pid_debug_label_map.append(&mut virtio_mmio_pid);
810 
811         // Event used to notify crosvm that guest OS is trying to suspend.
812         let suspend_evt = Event::new().map_err(Error::CreateEvent)?;
813 
814         if !components.no_i8042 {
815             Self::setup_legacy_i8042_device(
816                 &io_bus,
817                 irq_chip.pit_uses_speaker_port(),
818                 vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
819             )?;
820         }
821         let vm_request_tube = if !components.no_rtc {
822             let (host_tube, device_tube) = Tube::pair()
823                 .context("create tube")
824                 .map_err(Error::SetupCmos)?;
825             Self::setup_legacy_cmos_device(&io_bus, irq_chip, device_tube, components.memory_size)
826                 .map_err(Error::SetupCmos)?;
827             Some(host_tube)
828         } else {
829             None
830         };
831         Self::setup_serial_devices(
832             components.hv_cfg.protection_type,
833             irq_chip.as_irq_chip_mut(),
834             &io_bus,
835             serial_parameters,
836             serial_jail,
837             #[cfg(feature = "swap")]
838             swap_controller,
839         )?;
840         Self::setup_debugcon_devices(
841             components.hv_cfg.protection_type,
842             &io_bus,
843             serial_parameters,
844             debugcon_jail,
845             #[cfg(feature = "swap")]
846             swap_controller,
847         )?;
848 
849         let bios_size = if let VmImage::Bios(ref bios) = components.vm_image {
850             bios.metadata().map_err(Error::LoadBios)?.len()
851         } else {
852             0
853         };
854         if let Some(pflash_image) = components.pflash_image {
855             Self::setup_pflash(
856                 pflash_image,
857                 components.pflash_block_size,
858                 bios_size,
859                 &mmio_bus,
860                 pflash_jail,
861                 #[cfg(feature = "swap")]
862                 swap_controller,
863             )?;
864         }
865 
        // Functions that use/create jails MUST be used before the call to
        // setup_acpi_devices below, as this moves us into a multiprocessing state
        // from which we can no longer fork.
869 
870         let mut resume_notify_devices = Vec::new();
871 
872         // each bus occupy 1MB mmio for pcie enhanced configuration
873         let max_bus = (pcie_cfg_mmio_len / 0x100000 - 1) as u8;
874         let (mut acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
875             pci.clone(),
876             &mem,
877             &io_bus,
878             system_allocator,
879             suspend_evt.try_clone().map_err(Error::CloneEvent)?,
880             vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
881             components.acpi_sdts,
882             #[cfg(feature = "direct")]
883             &components.direct_gpe,
884             #[cfg(feature = "direct")]
885             &components.direct_fixed_evts,
886             irq_chip.as_irq_chip_mut(),
887             sci_irq,
888             battery,
889             &mmio_bus,
890             max_bus,
891             &mut resume_notify_devices,
892             #[cfg(feature = "swap")]
893             swap_controller,
894             #[cfg(unix)]
895             components.ac_adapter,
896         )?;
897 
898         // Create customized SSDT table
899         let sdt = acpi::create_customize_ssdt(pci.clone(), amls);
900         if let Some(sdt) = sdt {
901             acpi_dev_resource.sdts.push(sdt);
902         }
903 
904         irq_chip
905             .finalize_devices(system_allocator, &io_bus, &mmio_bus)
906             .map_err(Error::RegisterIrqfd)?;
907 
908         // All of these bios generated tables are set manually for the benefit of the kernel boot
909         // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't
910         // have a way to pass the BIOS these configs.
911         // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
912         // tables and the guest OS picks them up.
        // If another guest does need a way to pass these tables down to its BIOS, this approach
        // should be rethought.
915 
916         if mptable {
917             // Note that this puts the mptable at 0x9FC00 in guest physical memory.
918             mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
919                 .map_err(Error::SetupMptable)?;
920         }
921         smbios::setup_smbios(&mem, components.dmi_path, &components.oem_strings)
922             .map_err(Error::SetupSmbios)?;
923 
924         let host_cpus = if components.host_cpu_topology {
925             components.vcpu_affinity.clone()
926         } else {
927             None
928         };
929 
930         // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
931         acpi::create_acpi_tables(
932             &mem,
933             vcpu_count as u8,
934             sci_irq,
935             0xcf9,
936             6, // RST_CPU|SYS_RST
937             &acpi_dev_resource,
938             host_cpus,
939             vcpu_ids,
940             &pci_irqs,
941             pcie_cfg_mmio_range.start,
942             max_bus,
943             components.force_s2idle,
944         )
945         .ok_or(Error::CreateAcpi)?;
946 
947         let mut cmdline = Self::get_base_linux_cmdline();
948 
949         if noirq {
950             cmdline.insert_str("acpi=noirq").unwrap();
951         }
952 
953         get_serial_cmdline(&mut cmdline, serial_parameters, "io")
954             .map_err(Error::GetSerialCmdline)?;
955 
956         for param in components.extra_kernel_params {
957             cmdline.insert_str(&param).map_err(Error::Cmdline)?;
958         }
959 
960         if let Some(ramoops_region) = ramoops_region {
961             arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
962                 .map_err(Error::Cmdline)?;
963         }
964 
965         let pci_start = read_pci_mmio_before_32bit().start;
966 
967         let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count];
968 
969         let mut msrs;
970         match components.vm_image {
971             VmImage::Bios(ref mut bios) => {
972                 // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
973                 kernel_loader::load_cmdline(
974                     &mem,
975                     GuestAddress(CMDLINE_OFFSET),
976                     &CString::new(cmdline).unwrap(),
977                 )
978                 .map_err(Error::LoadCmdline)?;
979                 Self::load_bios(&mem, bios)?;
980                 msrs = regs::default_msrs();
981                 // The default values for `Regs` and `Sregs` already set up the reset vector.
982             }
983             VmImage::Kernel(ref mut kernel_image) => {
984                 let (params, kernel_end, kernel_entry) = Self::load_kernel(&mem, kernel_image)?;
985 
986                 Self::setup_system_memory(
987                     &mem,
988                     &CString::new(cmdline).unwrap(),
989                     components.initrd_image,
990                     components.android_fstab,
991                     kernel_end,
992                     params,
993                     dump_device_tree_blob,
994                 )?;
995 
996                 // Configure the bootstrap VCPU for the Linux/x86 64-bit boot protocol.
997                 // <https://www.kernel.org/doc/html/latest/x86/boot.html>
998                 vcpu_init[0].regs.rip = kernel_entry.offset();
999                 vcpu_init[0].regs.rsp = BOOT_STACK_POINTER;
1000                 vcpu_init[0].regs.rsi = ZERO_PAGE_OFFSET;
1001 
1002                 msrs = regs::long_mode_msrs();
1003                 msrs.append(&mut regs::mtrr_msrs(&vm, pci_start));
1004 
1005                 // Set up long mode and enable paging.
1006                 regs::configure_segments_and_sregs(&mem, &mut vcpu_init[0].sregs)
1007                     .map_err(Error::ConfigureSegments)?;
1008                 regs::setup_page_tables(&mem, &mut vcpu_init[0].sregs)
1009                     .map_err(Error::SetupPageTables)?;
1010             }
1011         }
1012 
1013         // Initialize MSRs for all VCPUs.
1014         for vcpu in vcpu_init.iter_mut() {
1015             vcpu.msrs = msrs.clone();
1016         }
1017 
1018         Ok(RunnableLinuxVm {
1019             vm,
1020             vcpu_count,
1021             vcpus: None,
1022             vcpu_affinity: components.vcpu_affinity,
1023             vcpu_init,
1024             no_smt: components.no_smt,
1025             irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?,
1026             has_bios: matches!(components.vm_image, VmImage::Bios(_)),
1027             io_bus,
1028             mmio_bus,
1029             pid_debug_label_map,
1030             suspend_evt,
1031             resume_notify_devices,
1032             rt_cpus: components.rt_cpus,
1033             delay_rt: components.delay_rt,
1034             bat_control,
1035             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1036             gdb: components.gdb,
1037             pm: Some(acpi_dev_resource.pm),
1038             root_config: pci,
1039             #[cfg(unix)]
1040             platform_devices: Vec::new(),
1041             hotplug_bus: BTreeMap::new(),
1042             devices_thread: None,
1043             vm_request_tube,
1044         })
1045     }
1046 
configure_vcpu<V: Vm>( vm: &V, hypervisor: &dyn HypervisorX86_64, irq_chip: &mut dyn IrqChipX86_64, vcpu: &mut dyn VcpuX86_64, vcpu_init: VcpuInitX86_64, vcpu_id: usize, num_cpus: usize, _has_bios: bool, cpu_config: Option<CpuConfigX86_64>, ) -> Result<()>1047     fn configure_vcpu<V: Vm>(
1048         vm: &V,
1049         hypervisor: &dyn HypervisorX86_64,
1050         irq_chip: &mut dyn IrqChipX86_64,
1051         vcpu: &mut dyn VcpuX86_64,
1052         vcpu_init: VcpuInitX86_64,
1053         vcpu_id: usize,
1054         num_cpus: usize,
1055         _has_bios: bool,
1056         cpu_config: Option<CpuConfigX86_64>,
1057     ) -> Result<()> {
1058         let cpu_config = match cpu_config {
1059             Some(config) => config,
1060             None => return Err(Error::InvalidCpuConfig),
1061         };
1062         if !vm.check_capability(VmCap::EarlyInitCpuid) {
1063             cpuid::setup_cpuid(hypervisor, irq_chip, vcpu, vcpu_id, num_cpus, cpu_config)
1064                 .map_err(Error::SetupCpuid)?;
1065         }
1066 
1067         vcpu.set_regs(&vcpu_init.regs).map_err(Error::WriteRegs)?;
1068 
1069         vcpu.set_sregs(&vcpu_init.sregs)
1070             .map_err(Error::SetupSregs)?;
1071 
1072         vcpu.set_fpu(&vcpu_init.fpu).map_err(Error::SetupFpu)?;
1073 
1074         let vcpu_supported_var_mtrrs = regs::vcpu_supported_variable_mtrrs(vcpu);
1075         let num_var_mtrrs = regs::count_variable_mtrrs(&vcpu_init.msrs);
1076         let msrs = if num_var_mtrrs > vcpu_supported_var_mtrrs {
1077             warn!(
1078                 "Too many variable MTRR entries ({} required, {} supported),
1079                 please check pci_start addr, guest with pass through device may be very slow",
1080                 num_var_mtrrs, vcpu_supported_var_mtrrs,
1081             );
1082             // Filter out the MTRR entries from the MSR list.
1083             vcpu_init
1084                 .msrs
1085                 .into_iter()
1086                 .filter(|&msr| !regs::is_mtrr_msr(msr.id))
1087                 .collect()
1088         } else {
1089             vcpu_init.msrs
1090         };
1091 
1092         vcpu.set_msrs(&msrs).map_err(Error::SetupMsrs)?;
1093 
1094         interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
1095 
1096         Ok(())
1097     }
1098 
register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>( linux: &mut RunnableLinuxVm<V, Vcpu>, device: Box<dyn PciDevice>, #[cfg(unix)] minijail: Option<Minijail>, resources: &mut SystemAllocator, hp_control_tube: &mpsc::Sender<PciRootCommand>, #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>, ) -> Result<PciAddress>1099     fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>(
1100         linux: &mut RunnableLinuxVm<V, Vcpu>,
1101         device: Box<dyn PciDevice>,
1102         #[cfg(unix)] minijail: Option<Minijail>,
1103         resources: &mut SystemAllocator,
1104         hp_control_tube: &mpsc::Sender<PciRootCommand>,
1105         #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>,
1106     ) -> Result<PciAddress> {
1107         arch::configure_pci_device(
1108             linux,
1109             device,
1110             #[cfg(unix)]
1111             minijail,
1112             resources,
1113             hp_control_tube,
1114             #[cfg(feature = "swap")]
1115             swap_controller,
1116         )
1117         .map_err(Error::ConfigurePciDevice)
1118     }
1119 }
1120 
1121 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1122 impl<T: VcpuX86_64> arch::GdbOps<T> for X8664arch {
1123     type Error = Error;
1124 
read_registers(vcpu: &T) -> Result<X86_64CoreRegs>1125     fn read_registers(vcpu: &T) -> Result<X86_64CoreRegs> {
1126         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
1127         let gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
1128         let regs = [
1129             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
1130             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
1131         ];
1132 
1133         // GDB exposes 32-bit eflags instead of 64-bit rflags.
1134         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
1135         let eflags = gregs.rflags as u32;
1136         let rip = gregs.rip;
1137 
1138         // Segment registers: CS, SS, DS, ES, FS, GS
1139         let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
1140         let segments = X86SegmentRegs {
1141             cs: sregs.cs.selector as u32,
1142             ss: sregs.ss.selector as u32,
1143             ds: sregs.ds.selector as u32,
1144             es: sregs.es.selector as u32,
1145             fs: sregs.fs.selector as u32,
1146             gs: sregs.gs.selector as u32,
1147         };
1148 
1149         // x87 FPU internal state
1150         // TODO(dverkamp): floating point tag word, instruction pointer, and data pointer
1151         let fpu = vcpu.get_fpu().map_err(Error::ReadRegs)?;
1152         let fpu_internal = X87FpuInternalRegs {
1153             fctrl: u32::from(fpu.fcw),
1154             fstat: u32::from(fpu.fsw),
1155             fop: u32::from(fpu.last_opcode),
1156             ..Default::default()
1157         };
1158 
1159         let mut regs = X86_64CoreRegs {
1160             regs,
1161             eflags,
1162             rip,
1163             segments,
1164             st: Default::default(),
1165             fpu: fpu_internal,
1166             xmm: Default::default(),
1167             mxcsr: fpu.mxcsr,
1168         };
1169 
1170         // x87 FPU registers: ST0-ST7
1171         for (dst, src) in regs.st.iter_mut().zip(fpu.fpr.iter()) {
1172             // `fpr` contains the x87 floating point registers in FXSAVE format.
1173             // Each element contains an 80-bit floating point value in the low 10 bytes.
1174             // The upper 6 bytes are reserved and can be ignored.
1175             dst.copy_from_slice(&src[0..10])
1176         }
1177 
1178         // SSE registers: XMM0-XMM15
1179         for (dst, src) in regs.xmm.iter_mut().zip(fpu.xmm.iter()) {
1180             *dst = u128::from_le_bytes(*src);
1181         }
1182 
1183         Ok(regs)
1184     }
1185 
write_registers(vcpu: &T, regs: &X86_64CoreRegs) -> Result<()>1186     fn write_registers(vcpu: &T, regs: &X86_64CoreRegs) -> Result<()> {
1187         // General purpose registers (RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15) + RIP + rflags
1188         let orig_gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
1189         let gregs = Regs {
1190             rax: regs.regs[0],
1191             rbx: regs.regs[1],
1192             rcx: regs.regs[2],
1193             rdx: regs.regs[3],
1194             rsi: regs.regs[4],
1195             rdi: regs.regs[5],
1196             rbp: regs.regs[6],
1197             rsp: regs.regs[7],
1198             r8: regs.regs[8],
1199             r9: regs.regs[9],
1200             r10: regs.regs[10],
1201             r11: regs.regs[11],
1202             r12: regs.regs[12],
1203             r13: regs.regs[13],
1204             r14: regs.regs[14],
1205             r15: regs.regs[15],
1206             rip: regs.rip,
1207             // Update the lower 32 bits of rflags.
1208             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
1209         };
1210         vcpu.set_regs(&gregs).map_err(Error::WriteRegs)?;
1211 
1212         // Segment registers: CS, SS, DS, ES, FS, GS
1213         // Since GDB care only selectors, we call get_sregs() first.
1214         let mut sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
1215         sregs.cs.selector = regs.segments.cs as u16;
1216         sregs.ss.selector = regs.segments.ss as u16;
1217         sregs.ds.selector = regs.segments.ds as u16;
1218         sregs.es.selector = regs.segments.es as u16;
1219         sregs.fs.selector = regs.segments.fs as u16;
1220         sregs.gs.selector = regs.segments.gs as u16;
1221 
1222         vcpu.set_sregs(&sregs).map_err(Error::WriteRegs)?;
1223 
1224         // FPU and SSE registers
1225         let mut fpu = vcpu.get_fpu().map_err(Error::ReadRegs)?;
1226         fpu.fcw = regs.fpu.fctrl as u16;
1227         fpu.fsw = regs.fpu.fstat as u16;
1228         fpu.last_opcode = regs.fpu.fop as u16;
1229         // TODO(dverkamp): floating point tag word, instruction pointer, and data pointer
1230 
1231         // x87 FPU registers: ST0-ST7
1232         for (dst, src) in fpu.fpr.iter_mut().zip(regs.st.iter()) {
1233             dst[0..10].copy_from_slice(src);
1234         }
1235 
1236         // SSE registers: XMM0-XMM15
1237         for (dst, src) in fpu.xmm.iter_mut().zip(regs.xmm.iter()) {
1238             dst.copy_from_slice(&src.to_le_bytes());
1239         }
1240 
1241         vcpu.set_fpu(&fpu).map_err(Error::WriteRegs)?;
1242 
1243         Ok(())
1244     }
1245 
    /// Single-register reads are not implemented here.
    ///
    /// Both arguments are ignored and the call always fails with
    /// `Error::ReadRegIsUnsupported`.
    #[inline]
    fn read_register(_vcpu: &T, _reg: X86_64CoreRegId) -> Result<Vec<u8>> {
        Err(Error::ReadRegIsUnsupported)
    }
1250 
    /// Single-register writes are not implemented here.
    ///
    /// All arguments are ignored and the call always fails with
    /// `Error::WriteRegIsUnsupported`.
    #[inline]
    fn write_register(_vcpu: &T, _reg: X86_64CoreRegId, _buf: &[u8]) -> Result<()> {
        Err(Error::WriteRegIsUnsupported)
    }
1255 
read_memory( vcpu: &T, guest_mem: &GuestMemory, vaddr: GuestAddress, len: usize, ) -> Result<Vec<u8>>1256     fn read_memory(
1257         vcpu: &T,
1258         guest_mem: &GuestMemory,
1259         vaddr: GuestAddress,
1260         len: usize,
1261     ) -> Result<Vec<u8>> {
1262         let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
1263         let mut buf = vec![0; len];
1264         let mut total_read = 0u64;
1265         // Handle reads across page boundaries.
1266 
1267         while total_read < len as u64 {
1268             let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_read, &sregs)?;
1269             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
1270             guest_mem
1271                 .get_slice_at_addr(GuestAddress(paddr), read_len as usize)
1272                 .map_err(Error::ReadingGuestMemory)?
1273                 .copy_to(&mut buf[total_read as usize..]);
1274             total_read += read_len;
1275         }
1276         Ok(buf)
1277     }
1278 
write_memory( vcpu: &T, guest_mem: &GuestMemory, vaddr: GuestAddress, buf: &[u8], ) -> Result<()>1279     fn write_memory(
1280         vcpu: &T,
1281         guest_mem: &GuestMemory,
1282         vaddr: GuestAddress,
1283         buf: &[u8],
1284     ) -> Result<()> {
1285         let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
1286         let mut total_written = 0u64;
1287         // Handle writes across page boundaries.
1288         while total_written < buf.len() as u64 {
1289             let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_written, &sregs)?;
1290             let write_len = std::cmp::min(
1291                 buf.len() as u64 - total_written,
1292                 psize - (paddr & (psize - 1)),
1293             );
1294 
1295             guest_mem
1296                 .write_all_at_addr(
1297                     &buf[total_written as usize..(total_written as usize + write_len as usize)],
1298                     GuestAddress(paddr),
1299                 )
1300                 .map_err(Error::WritingGuestMemory)?;
1301             total_written += write_len;
1302         }
1303         Ok(())
1304     }
1305 
    /// Turns on single-stepping for `vcpu`.
    ///
    /// Calls `set_guest_debug` with an empty breakpoint list and the single-step flag set.
    /// A failure is reported as `Error::EnableSinglestep`.
    // NOTE(review): passing an empty list presumably clears any hardware breakpoints that were
    // installed earlier — confirm against the `set_guest_debug` implementation.
    fn enable_singlestep(vcpu: &T) -> Result<()> {
        vcpu.set_guest_debug(&[], true /* enable_singlestep */)
            .map_err(Error::EnableSinglestep)
    }
1310 
    /// Reports how many hardware breakpoints can be set at once.
    ///
    /// The answer is a fixed `4` and does not depend on `_vcpu`.
    // NOTE(review): presumably this matches the four x86 debug address registers — confirm.
    fn get_max_hw_breakpoints(_vcpu: &T) -> Result<usize> {
        Ok(4usize)
    }
1314 
    /// Installs `breakpoints` as the hardware breakpoint addresses for `vcpu`.
    ///
    /// Calls `set_guest_debug` with the single-step flag cleared. A failure is reported as
    /// `Error::SetHwBreakpoint`.
    fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<()> {
        vcpu.set_guest_debug(breakpoints, false /* enable_singlestep */)
            .map_err(Error::SetHwBreakpoint)
    }
1319 }
1320 
/// Translates the guest virtual address `vaddr` into a guest physical address by walking the
/// guest's page tables.
///
/// # Arguments
///
/// * `mem` - Guest memory the page tables are read from.
/// * `vaddr` - Guest virtual address to translate.
/// * `sregs` - Special registers of the vCPU; `cr0`, `cr3`, `cr4` and `efer` select the mode.
///
/// # Returns
///
/// The translated address and the size of the page it resides in. When paging is disabled
/// (`CR0.PG` clear) the address is returned unchanged with a 4 KiB page size.
///
/// # Errors
///
/// * `Error::TranslatingVirtAddr` - paging is enabled without PAE, the vCPU is not in long mode,
///   or a page table entry could not be read from guest memory.
/// * `Error::PageNotPresent` - an entry on the walk has its present bit clear.
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
fn phys_addr(mem: &GuestMemory, vaddr: u64, sregs: &Sregs) -> Result<(u64, u64)> {
    const CR0_PG_MASK: u64 = 1 << 31;
    const CR4_PAE_MASK: u64 = 1 << 5;
    const CR4_LA57_MASK: u64 = 1 << 12;
    const MSR_EFER_LMA: u64 = 1 << 10;
    // bits 12 through 51 are the address in a PTE.
    const PTE_ADDR_MASK: u64 = ((1 << 52) - 1) & !0x0fff;
    const PAGE_PRESENT: u64 = 0x1;
    const PAGE_PSE_MASK: u64 = 0x1 << 7;

    const PAGE_SIZE_4K: u64 = 4 * 1024;
    const PAGE_SIZE_2M: u64 = 2 * 1024 * 1024;
    const PAGE_SIZE_1G: u64 = 1024 * 1024 * 1024;

    // Read the entry for `vaddr` from the table of the given `level` located at
    // `curr_table_addr`, failing if the entry is not present.
    fn next_pte(mem: &GuestMemory, curr_table_addr: u64, vaddr: u64, level: usize) -> Result<u64> {
        let ent: u64 = mem
            .read_obj_from_addr(GuestAddress(
                (curr_table_addr & PTE_ADDR_MASK) + page_table_offset(vaddr, level),
            ))
            .map_err(|_| Error::TranslatingVirtAddr)?;
        if ent & PAGE_PRESENT == 0 {
            return Err(Error::PageNotPresent);
        }
        Ok(ent)
    }

    // Get the offset in to the page of `vaddr`.
    fn page_offset(vaddr: u64, page_size: u64) -> u64 {
        vaddr & (page_size - 1)
    }

    // Get the offset in to the page table of the given `level` specified by the virtual `address`.
    // `level` is 1 through 5 in x86_64 to handle the five levels of paging.
    fn page_table_offset(addr: u64, level: usize) -> u64 {
        let offset = (level - 1) * 9 + 12;
        ((addr >> offset) & 0x1ff) << 3
    }

    // Combine the frame address stored in the leaf entry `ent` with the offset of `vaddr` inside
    // a page of `page_size` bytes. Entry bits below the page size are masked off first: in 2M and
    // 1G leaf entries they are not address bits (bit 12 is the PAT bit there), so keeping them
    // would corrupt the result.
    fn leaf_addr(ent: u64, vaddr: u64, page_size: u64) -> u64 {
        (ent & PTE_ADDR_MASK & !(page_size - 1)) | page_offset(vaddr, page_size)
    }

    if sregs.cr0 & CR0_PG_MASK == 0 {
        return Ok((vaddr, PAGE_SIZE_4K));
    }

    if sregs.cr4 & CR4_PAE_MASK == 0 {
        return Err(Error::TranslatingVirtAddr);
    }

    if sregs.efer & MSR_EFER_LMA != 0 {
        // With 5-level paging (CR4.LA57) CR3 points at the level 5 table, so one extra step is
        // needed to reach the level 4 table.
        let p4_table = if sregs.cr4 & CR4_LA57_MASK != 0 {
            next_pte(mem, sregs.cr3, vaddr, 5)?
        } else {
            sregs.cr3
        };
        let p4_ent = next_pte(mem, p4_table, vaddr, 4)?;
        let p3_ent = next_pte(mem, p4_ent, vaddr, 3)?;
        if p3_ent & PAGE_PSE_MASK != 0 {
            // It's a 1G page with the PSE bit in p3_ent
            return Ok((leaf_addr(p3_ent, vaddr, PAGE_SIZE_1G), PAGE_SIZE_1G));
        }
        let p2_ent = next_pte(mem, p3_ent, vaddr, 2)?;
        if p2_ent & PAGE_PSE_MASK != 0 {
            // It's a 2M page with the PSE bit in p2_ent
            return Ok((leaf_addr(p2_ent, vaddr, PAGE_SIZE_2M), PAGE_SIZE_2M));
        }
        let p1_ent = next_pte(mem, p2_ent, vaddr, 1)?;
        return Ok((leaf_addr(p1_ent, vaddr, PAGE_SIZE_4K), PAGE_SIZE_4K));
    }
    Err(Error::TranslatingVirtAddr)
}
1403 
// Status bit that the generated `_OSC` method ORs into CDW1 (the flag/return-value dword) when
// the UUID passed by the guest is not the one it handles.
const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4;
// Bits of the PCI host bridge `_OSC` control dword (CDW3). Only SHPC_HP and PCIE_AER are used by
// the code in this file: they are the bits the generated method clears. The others are kept for
// reference, hence the `dead_code` allowances.
// NOTE(review): bit meanings (hotplug, SHPC hotplug, PME, AER, capability structure) are inferred
// from the constant names — confirm against the PCI firmware specification.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1;
const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4;
const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10;

/// Stateless marker type whose `Aml` implementation emits the `_OSC` method.
struct PciRootOSC {}
1417 
1418 // Method (_OSC, 4, NotSerialized)  // _OSC: Operating System Capabilities
1419 // {
1420 //     CreateDWordField (Arg3, Zero, CDW1)  // flag and return value
1421 //     If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
1422 //     {
1423 //         CreateDWordField (Arg3, 8, CDW3) // control field
1424 //         if ( 0 == (CDW1 & 0x01))  // Query flag ?
1425 //         {
1426 //              CDW3 &= !(SHPC_HP | AER)
1427 //         }
1428 //     } Else {
1429 //         CDW1 |= UNSUPPORT_UUID
1430 //     }
1431 //     Return (Arg3)
1432 // }
1433 impl Aml for PciRootOSC {
to_aml_bytes(&self, aml: &mut Vec<u8>)1434     fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
1435         let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
1436         // virtual pcie root port supports hotplug, pme, and pcie cap register, clear all
1437         // the other bits.
1438         let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP | PCI_HB_OSC_CONTROL_PCIE_AER);
1439         aml::Method::new(
1440             "_OSC".into(),
1441             4,
1442             false,
1443             vec![
1444                 &aml::CreateDWordField::new(
1445                     &aml::Name::new_field_name("CDW1"),
1446                     &aml::Arg(3),
1447                     &aml::ZERO,
1448                 ),
1449                 &aml::If::new(
1450                     &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
1451                     vec![
1452                         &aml::CreateDWordField::new(
1453                             &aml::Name::new_field_name("CDW3"),
1454                             &aml::Arg(3),
1455                             &(8_u8),
1456                         ),
1457                         &aml::If::new(
1458                             &aml::Equal::new(
1459                                 &aml::ZERO,
1460                                 &aml::And::new(
1461                                     &aml::ZERO,
1462                                     &aml::Name::new_field_name("CDW1"),
1463                                     &aml::ONE,
1464                                 ),
1465                             ),
1466                             vec![&aml::And::new(
1467                                 &aml::Name::new_field_name("CDW3"),
1468                                 &mask,
1469                                 &aml::Name::new_field_name("CDW3"),
1470                             )],
1471                         ),
1472                     ],
1473                 ),
1474                 &aml::Else::new(vec![&aml::Or::new(
1475                     &aml::Name::new_field_name("CDW1"),
1476                     &OSC_STATUS_UNSUPPORT_UUID,
1477                     &aml::Name::new_field_name("CDW1"),
1478                 )]),
1479                 &aml::Return::new(&aml::Arg(3)),
1480             ],
1481         )
1482         .to_aml_bytes(aml)
1483     }
1484 }
1485 
1486 impl X8664arch {
1487     /// Loads the bios from an open file.
1488     ///
1489     /// # Arguments
1490     ///
1491     /// * `mem` - The memory to be used by the guest.
1492     /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>1493     fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1494         let bios_image_length = bios_image
1495             .seek(io::SeekFrom::End(0))
1496             .map_err(Error::LoadBios)?;
1497         if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1498             return Err(Error::LoadBios(io::Error::new(
1499                 io::ErrorKind::InvalidData,
1500                 format!(
1501                     "bios was {} bytes, expected less than {}",
1502                     bios_image_length, FIRST_ADDR_PAST_32BITS,
1503                 ),
1504             )));
1505         }
1506         bios_image
1507             .seek(io::SeekFrom::Start(0))
1508             .map_err(Error::LoadBios)?;
1509         mem.read_to_memory(
1510             bios_start(bios_image_length),
1511             bios_image,
1512             bios_image_length as usize,
1513         )
1514         .map_err(Error::SetupGuestMemory)?;
1515         Ok(())
1516     }
1517 
setup_pflash( pflash_image: File, block_size: u32, bios_size: u64, mmio_bus: &devices::Bus, jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>, ) -> Result<()>1518     fn setup_pflash(
1519         pflash_image: File,
1520         block_size: u32,
1521         bios_size: u64,
1522         mmio_bus: &devices::Bus,
1523         jail: Option<Minijail>,
1524         #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>,
1525     ) -> Result<()> {
1526         let size = pflash_image.metadata().map_err(Error::LoadPflash)?.len();
1527         let start = FIRST_ADDR_PAST_32BITS - bios_size - size;
1528         let pflash_image = Box::new(pflash_image);
1529 
1530         #[cfg(unix)]
1531         let fds = pflash_image.as_raw_descriptors();
1532 
1533         let pflash = Pflash::new(pflash_image, block_size).map_err(Error::SetupPflash)?;
1534         let pflash: Arc<Mutex<dyn BusDevice>> = match jail {
1535             #[cfg(unix)]
1536             Some(jail) => Arc::new(Mutex::new(
1537                 ProxyDevice::new(
1538                     pflash,
1539                     jail,
1540                     fds,
1541                     #[cfg(feature = "swap")]
1542                     swap_controller,
1543                 )
1544                 .map_err(Error::CreateProxyDevice)?,
1545             )),
1546             #[cfg(windows)]
1547             Some(_) => unreachable!(),
1548             None => Arc::new(Mutex::new(pflash)),
1549         };
1550         mmio_bus
1551             .insert(pflash, start, size)
1552             .map_err(Error::InsertBus)?;
1553 
1554         Ok(())
1555     }
1556 
1557     /// Loads the kernel from an open file.
1558     ///
1559     /// # Arguments
1560     ///
1561     /// * `mem` - The memory to be used by the guest.
1562     /// * `kernel_image` - the File object for the specified kernel.
1563     ///
1564     /// # Returns
1565     ///
1566     /// On success, returns the Linux x86_64 boot protocol parameters, the first address past the
1567     /// end of the kernel, and the entry point (initial `RIP` value).
load_kernel( mem: &GuestMemory, kernel_image: &mut File, ) -> Result<(boot_params, u64, GuestAddress)>1568     fn load_kernel(
1569         mem: &GuestMemory,
1570         kernel_image: &mut File,
1571     ) -> Result<(boot_params, u64, GuestAddress)> {
1572         let kernel_start = GuestAddress(KERNEL_START_OFFSET);
1573         match kernel_loader::load_elf64(mem, kernel_start, kernel_image, 0) {
1574             Ok(loaded_kernel) => {
1575                 // ELF kernels don't contain a `boot_params` structure, so synthesize a default one.
1576                 let boot_params = Default::default();
1577                 Ok((
1578                     boot_params,
1579                     loaded_kernel.address_range.end,
1580                     loaded_kernel.entry,
1581                 ))
1582             }
1583             Err(kernel_loader::Error::InvalidMagicNumber) => {
1584                 // The image failed to parse as ELF, so try to load it as a bzImage.
1585                 let (boot_params, bzimage_end) =
1586                     bzimage::load_bzimage(mem, kernel_start, kernel_image)
1587                         .map_err(Error::LoadBzImage)?;
1588                 let bzimage_entry = mem
1589                     .checked_offset(kernel_start, KERNEL_64BIT_ENTRY_OFFSET)
1590                     .ok_or(Error::KernelOffsetPastEnd)?;
1591                 Ok((boot_params, bzimage_end, bzimage_entry))
1592             }
1593             Err(e) => Err(Error::LoadKernel(e)),
1594         }
1595     }
1596 
1597     /// Configures the system memory space should be called once per vm before
1598     /// starting vcpu threads.
1599     ///
1600     /// # Arguments
1601     ///
1602     /// * `mem` - The memory to be used by the guest.
1603     /// * `cmdline` - the kernel commandline
1604     /// * `initrd_file` - an initial ramdisk image
setup_system_memory( mem: &GuestMemory, cmdline: &CStr, initrd_file: Option<File>, android_fstab: Option<File>, kernel_end: u64, params: boot_params, dump_device_tree_blob: Option<PathBuf>, ) -> Result<()>1605     pub fn setup_system_memory(
1606         mem: &GuestMemory,
1607         cmdline: &CStr,
1608         initrd_file: Option<File>,
1609         android_fstab: Option<File>,
1610         kernel_end: u64,
1611         params: boot_params,
1612         dump_device_tree_blob: Option<PathBuf>,
1613     ) -> Result<()> {
1614         kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
1615             .map_err(Error::LoadCmdline)?;
1616 
1617         let mut setup_data = Vec::<SetupData>::new();
1618         if let Some(android_fstab) = android_fstab {
1619             setup_data.push(
1620                 fdt::create_fdt(android_fstab, dump_device_tree_blob).map_err(Error::CreateFdt)?,
1621             );
1622         }
1623         setup_data.push(setup_data_rng_seed());
1624 
1625         let setup_data = write_setup_data(
1626             mem,
1627             GuestAddress(SETUP_DATA_START),
1628             GuestAddress(SETUP_DATA_END),
1629             &setup_data,
1630         )?;
1631 
1632         let initrd = match initrd_file {
1633             Some(mut initrd_file) => {
1634                 let mut initrd_addr_max = u64::from(params.hdr.initrd_addr_max);
1635                 // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
1636                 if initrd_addr_max == 0 {
1637                     initrd_addr_max = 0x37FFFFFF;
1638                 }
1639 
1640                 let mem_max = mem.end_addr().offset() - 1;
1641                 if initrd_addr_max > mem_max {
1642                     initrd_addr_max = mem_max;
1643                 }
1644 
1645                 let (initrd_start, initrd_size) = arch::load_image_high(
1646                     mem,
1647                     &mut initrd_file,
1648                     GuestAddress(kernel_end),
1649                     GuestAddress(initrd_addr_max),
1650                     base::pagesize() as u64,
1651                 )
1652                 .map_err(Error::LoadInitrd)?;
1653                 Some((initrd_start, initrd_size))
1654             }
1655             None => None,
1656         };
1657 
1658         configure_system(
1659             mem,
1660             GuestAddress(KERNEL_START_OFFSET),
1661             GuestAddress(CMDLINE_OFFSET),
1662             cmdline.to_bytes().len() + 1,
1663             setup_data,
1664             initrd,
1665             params,
1666         )?;
1667         Ok(())
1668     }
1669 
get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange1670     fn get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange {
1671         // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is greater.
1672         let ram_end_round_2mb = (mem.end_addr().offset() + 2 * MB - 1) / (2 * MB) * (2 * MB);
1673         let start = std::cmp::max(ram_end_round_2mb, 4 * GB);
1674         // Each pci device's ECAM size is 4kb and its vcfg size is 8kb
1675         let end = start + pcie_cfg_mmio.len().unwrap() * 2 - 1;
1676         AddressRange { start, end }
1677     }
1678 
1679     /// Returns the high mmio range
get_high_mmio_range<V: Vm>(vm: &V) -> AddressRange1680     fn get_high_mmio_range<V: Vm>(vm: &V) -> AddressRange {
1681         let mem = vm.get_memory();
1682         let start = Self::get_pcie_vcfg_mmio_range(mem, &read_pcie_cfg_mmio()).end + 1;
1683 
1684         let phys_mem_end = (1u64 << vm.get_guest_phys_addr_bits()) - 1;
1685         let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
1686 
1687         AddressRange {
1688             start,
1689             end: high_mmio_end,
1690         }
1691     }
1692 
1693     /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline() -> kernel_cmdline::Cmdline1694     pub fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
1695         let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
1696         cmdline.insert_str("panic=-1").unwrap();
1697 
1698         cmdline
1699     }
1700 
1701     /// Sets up the legacy x86 i8042/KBD platform device
1702     ///
1703     /// # Arguments
1704     ///
1705     /// * - `io_bus` - the IO bus object
1706     /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
1707     /// * - `vm_evt_wrtube` - the event object which should receive exit events
setup_legacy_i8042_device( io_bus: &devices::Bus, pit_uses_speaker_port: bool, vm_evt_wrtube: SendTube, ) -> Result<()>1708     pub fn setup_legacy_i8042_device(
1709         io_bus: &devices::Bus,
1710         pit_uses_speaker_port: bool,
1711         vm_evt_wrtube: SendTube,
1712     ) -> Result<()> {
1713         let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
1714             vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1715         )));
1716 
1717         if pit_uses_speaker_port {
1718             io_bus.insert(i8042, 0x062, 0x3).unwrap();
1719         } else {
1720             io_bus.insert(i8042, 0x061, 0x4).unwrap();
1721         }
1722 
1723         Ok(())
1724     }
1725 
1726     /// Sets up the legacy x86 CMOS/RTC platform device
1727     /// # Arguments
1728     ///
1729     /// * - `io_bus` - the IO bus object
1730     /// * - `mem_size` - the size in bytes of physical ram for the guest
setup_legacy_cmos_device( io_bus: &devices::Bus, irq_chip: &mut dyn IrqChipX86_64, vm_control: Tube, mem_size: u64, ) -> anyhow::Result<()>1731     pub fn setup_legacy_cmos_device(
1732         io_bus: &devices::Bus,
1733         irq_chip: &mut dyn IrqChipX86_64,
1734         vm_control: Tube,
1735         mem_size: u64,
1736     ) -> anyhow::Result<()> {
1737         let mem_regions = arch_memory_regions(mem_size, None);
1738 
1739         let mem_below_4g = mem_regions
1740             .iter()
1741             .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
1742             .map(|r| r.1)
1743             .sum();
1744 
1745         let mem_above_4g = mem_regions
1746             .iter()
1747             .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
1748             .map(|r| r.1)
1749             .sum();
1750 
1751         let irq_evt = devices::IrqEdgeEvent::new().context("cmos irq")?;
1752         let cmos = devices::cmos::Cmos::new(
1753             mem_below_4g,
1754             mem_above_4g,
1755             Utc::now,
1756             vm_control,
1757             irq_evt.try_clone().context("cmos irq clone")?,
1758         )
1759         .context("create cmos")?;
1760 
1761         irq_chip
1762             .register_edge_irq_event(
1763                 devices::cmos::RTC_IRQ as u32,
1764                 &irq_evt,
1765                 IrqEventSource::from_device(&cmos),
1766             )
1767             .context("cmos register irq")?;
1768         io_bus
1769             .insert(Arc::new(Mutex::new(cmos)), 0x70, 0x2)
1770             .context("cmos insert irq")?;
1771 
1772         Ok(())
1773     }
1774 
1775     /// Sets up the acpi devices for this platform and
1776     /// return the resources which is used to set the ACPI tables.
1777     ///
1778     /// # Arguments
1779     ///
1780     /// * - `io_bus` the I/O bus to add the devices to
1781     /// * - `resources` the SystemAllocator to allocate IO and MMIO for acpi
1782     ///                devices.
1783     /// * - `suspend_evt` the event object which used to suspend the vm
1784     /// * - `sdts` ACPI system description tables
1785     /// * - `irq_chip` the IrqChip object for registering irq events
1786     /// * - `battery` indicate whether to create the battery
1787     /// * - `mmio_bus` the MMIO bus to add the devices to
setup_acpi_devices( pci_root: Arc<Mutex<PciRoot>>, mem: &GuestMemory, io_bus: &devices::Bus, resources: &mut SystemAllocator, suspend_evt: Event, vm_evt_wrtube: SendTube, sdts: Vec<SDT>, #[cfg(feature = "direct")] direct_gpe: &[u32], #[cfg(feature = "direct")] direct_fixed_evts: &[devices::ACPIPMFixedEvent], irq_chip: &mut dyn IrqChip, sci_irq: u32, battery: (Option<BatteryType>, Option<Minijail>), #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &devices::Bus, max_bus: u8, resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>, #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>, #[cfg(unix)] ac_adapter: bool, ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)>1788     pub fn setup_acpi_devices(
1789         pci_root: Arc<Mutex<PciRoot>>,
1790         mem: &GuestMemory,
1791         io_bus: &devices::Bus,
1792         resources: &mut SystemAllocator,
1793         suspend_evt: Event,
1794         vm_evt_wrtube: SendTube,
1795         sdts: Vec<SDT>,
1796         #[cfg(feature = "direct")] direct_gpe: &[u32],
1797         #[cfg(feature = "direct")] direct_fixed_evts: &[devices::ACPIPMFixedEvent],
1798         irq_chip: &mut dyn IrqChip,
1799         sci_irq: u32,
1800         battery: (Option<BatteryType>, Option<Minijail>),
1801         #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &devices::Bus,
1802         max_bus: u8,
1803         resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
1804         #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>,
1805         #[cfg(unix)] ac_adapter: bool,
1806     ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
1807         // The AML data for the acpi devices
1808         let mut amls = Vec::new();
1809 
1810         let bat_control = if let Some(battery_type) = battery.0 {
1811             match battery_type {
1812                 #[cfg(unix)]
1813                 BatteryType::Goldfish => {
1814                     let irq_num = resources.allocate_irq().ok_or(Error::CreateBatDevices(
1815                         arch::DeviceRegistrationError::AllocateIrq,
1816                     ))?;
1817                     let (control_tube, _mmio_base) = arch::sys::unix::add_goldfish_battery(
1818                         &mut amls,
1819                         battery.1,
1820                         mmio_bus,
1821                         irq_chip,
1822                         irq_num,
1823                         resources,
1824                         #[cfg(feature = "swap")]
1825                         swap_controller,
1826                     )
1827                     .map_err(Error::CreateBatDevices)?;
1828                     Some(BatControl {
1829                         type_: BatteryType::Goldfish,
1830                         control_tube,
1831                     })
1832                 }
1833                 #[cfg(windows)]
1834                 _ => None,
1835             }
1836         } else {
1837             None
1838         };
1839 
1840         let pm_alloc = resources.get_anon_alloc();
1841         let pm_iobase = match resources.io_allocator() {
1842             Some(io) => io
1843                 .allocate_with_align(
1844                     devices::acpi::ACPIPM_RESOURCE_LEN as u64,
1845                     pm_alloc,
1846                     "ACPIPM".to_string(),
1847                     4, // must be 32-bit aligned
1848                 )
1849                 .map_err(Error::AllocateIOResouce)?,
1850             None => 0x600,
1851         };
1852 
1853         let pcie_vcfg = aml::Name::new(
1854             "VCFG".into(),
1855             &Self::get_pcie_vcfg_mmio_range(mem, &read_pcie_cfg_mmio()).start,
1856         );
1857         pcie_vcfg.to_aml_bytes(&mut amls);
1858 
1859         #[cfg(feature = "direct")]
1860         let direct_evt_info = if direct_gpe.is_empty() && direct_fixed_evts.is_empty() {
1861             None
1862         } else {
1863             let direct_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
1864             let mut sci_devirq =
1865                 devices::DirectIrq::new_level(&direct_sci_evt).map_err(Error::EnableAcpiEvent)?;
1866 
1867             sci_devirq
1868                 .sci_irq_prepare()
1869                 .map_err(Error::EnableAcpiEvent)?;
1870 
1871             for gpe in direct_gpe {
1872                 sci_devirq
1873                     .gpe_enable_forwarding(*gpe)
1874                     .map_err(Error::EnableAcpiEvent)?;
1875             }
1876 
1877             for evt in direct_fixed_evts {
1878                 sci_devirq
1879                     .fixed_event_enable_forwarding(*evt)
1880                     .map_err(Error::EnableAcpiEvent)?;
1881             }
1882 
1883             Some((direct_sci_evt, direct_gpe, direct_fixed_evts))
1884         };
1885 
1886         let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
1887 
1888         #[cfg(unix)]
1889         let acdc = if ac_adapter {
1890             // Allocate GPE for AC adapter notfication
1891             let gpe = resources.allocate_gpe().ok_or(Error::AllocateGpe)?;
1892 
1893             let alloc = resources.get_anon_alloc();
1894             let mmio_base = resources
1895                 .allocate_mmio(
1896                     devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
1897                     alloc,
1898                     "AcAdapter".to_string(),
1899                     resources::AllocOptions::new().align(devices::ac_adapter::ACDC_VIRT_MMIO_SIZE),
1900                 )
1901                 .unwrap();
1902             let ac_adapter_dev = devices::ac_adapter::AcAdapter::new(mmio_base, gpe);
1903             let ac_dev = Arc::new(Mutex::new(ac_adapter_dev));
1904             mmio_bus
1905                 .insert(
1906                     ac_dev.clone(),
1907                     mmio_base,
1908                     devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
1909                 )
1910                 .unwrap();
1911 
1912             ac_dev.lock().to_aml_bytes(&mut amls);
1913             Some(ac_dev)
1914         } else {
1915             None
1916         };
1917         #[cfg(windows)]
1918         let acdc = None;
1919 
1920         let mut pmresource = devices::ACPIPMResource::new(
1921             pm_sci_evt.try_clone().map_err(Error::CloneEvent)?,
1922             #[cfg(feature = "direct")]
1923             direct_evt_info,
1924             suspend_evt,
1925             vm_evt_wrtube,
1926             acdc,
1927         );
1928         pmresource.to_aml_bytes(&mut amls);
1929         irq_chip
1930             .register_level_irq_event(
1931                 sci_irq,
1932                 &pm_sci_evt,
1933                 IrqEventSource::from_device(&pmresource),
1934             )
1935             .map_err(Error::RegisterIrqfd)?;
1936         pmresource.start();
1937 
1938         let mut crs_entries: Vec<Box<dyn Aml>> = vec![
1939             Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
1940             Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
1941         ];
1942         for r in resources.mmio_pools() {
1943             let entry: Box<dyn Aml> = match (u32::try_from(r.start), u32::try_from(r.end)) {
1944                 (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
1945                     aml::AddressSpaceCachable::NotCacheable,
1946                     true,
1947                     start,
1948                     end,
1949                 )),
1950                 _ => Box::new(aml::AddressSpace::new_memory(
1951                     aml::AddressSpaceCachable::NotCacheable,
1952                     true,
1953                     r.start,
1954                     r.end,
1955                 )),
1956             };
1957             crs_entries.push(entry);
1958         }
1959 
1960         aml::Device::new(
1961             "_SB_.PC00".into(),
1962             vec![
1963                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08")),
1964                 &aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03")),
1965                 &aml::Name::new("_ADR".into(), &aml::ZERO),
1966                 &aml::Name::new("_SEG".into(), &aml::ZERO),
1967                 &aml::Name::new("_UID".into(), &aml::ZERO),
1968                 &aml::Name::new("SUPP".into(), &aml::ZERO),
1969                 &aml::Name::new(
1970                     "_CRS".into(),
1971                     &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
1972                 ),
1973                 &PciRootOSC {},
1974             ],
1975         )
1976         .to_aml_bytes(&mut amls);
1977 
1978         let root_bus = pci_root.lock().get_root_bus();
1979         let addresses = root_bus.lock().get_downstream_devices();
1980         for address in addresses {
1981             if let Some(acpi_path) = pci_root.lock().acpi_path(&address) {
1982                 aml::Device::new(
1983                     (*acpi_path).into(),
1984                     vec![&aml::Name::new("_ADR".into(), &address.acpi_adr())],
1985                 )
1986                 .to_aml_bytes(&mut amls);
1987             }
1988         }
1989 
1990         let pm = Arc::new(Mutex::new(pmresource));
1991         io_bus
1992             .insert(
1993                 pm.clone(),
1994                 pm_iobase as u64,
1995                 devices::acpi::ACPIPM_RESOURCE_LEN as u64,
1996             )
1997             .unwrap();
1998         resume_notify_devices.push(pm.clone());
1999 
2000         Ok((
2001             acpi::AcpiDevResource {
2002                 amls,
2003                 pm_iobase,
2004                 pm,
2005                 sdts,
2006             },
2007             bat_control,
2008         ))
2009     }
2010 
2011     /// Sets up the serial devices for this platform. Returns the serial port number and serial
2012     /// device to be used for stdout
2013     ///
2014     /// # Arguments
2015     ///
2016     /// * - `irq_chip` the IrqChip object for registering irq events
2017     /// * - `io_bus` the I/O bus to add the devices to
2018     /// * - `serial_parmaters` - definitions for how the serial devices should be configured
setup_serial_devices( protection_type: ProtectionType, irq_chip: &mut dyn IrqChip, io_bus: &devices::Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>, ) -> Result<()>2019     pub fn setup_serial_devices(
2020         protection_type: ProtectionType,
2021         irq_chip: &mut dyn IrqChip,
2022         io_bus: &devices::Bus,
2023         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2024         serial_jail: Option<Minijail>,
2025         #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>,
2026     ) -> Result<()> {
2027         let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2028         let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2029 
2030         arch::add_serial_devices(
2031             protection_type,
2032             io_bus,
2033             com_evt_1_3.get_trigger(),
2034             com_evt_2_4.get_trigger(),
2035             serial_parameters,
2036             serial_jail,
2037             #[cfg(feature = "swap")]
2038             swap_controller,
2039         )
2040         .map_err(Error::CreateSerialDevices)?;
2041 
2042         let source = IrqEventSource {
2043             device_id: Serial::device_id(),
2044             queue_id: 0,
2045             device_name: Serial::debug_label(),
2046         };
2047         irq_chip
2048             .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3, source.clone())
2049             .map_err(Error::RegisterIrqfd)?;
2050         irq_chip
2051             .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4, source)
2052             .map_err(Error::RegisterIrqfd)?;
2053 
2054         Ok(())
2055     }
2056 
setup_debugcon_devices( protection_type: ProtectionType, io_bus: &devices::Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, debugcon_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>, ) -> Result<()>2057     fn setup_debugcon_devices(
2058         protection_type: ProtectionType,
2059         io_bus: &devices::Bus,
2060         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2061         debugcon_jail: Option<Minijail>,
2062         #[cfg(feature = "swap")] swap_controller: Option<&swap::SwapController>,
2063     ) -> Result<()> {
2064         for param in serial_parameters.values() {
2065             if param.hardware != SerialHardware::Debugcon {
2066                 continue;
2067             }
2068 
2069             let mut preserved_fds = Vec::new();
2070             let con = param
2071                 .create_serial_device::<Debugcon>(
2072                     protection_type,
2073                     // Debugcon doesn't use the interrupt event
2074                     &Event::new().map_err(Error::CreateEvent)?,
2075                     &mut preserved_fds,
2076                 )
2077                 .map_err(Error::CreateDebugconDevice)?;
2078 
2079             let con: Arc<Mutex<dyn BusDevice>> = match debugcon_jail.as_ref() {
2080                 #[cfg(unix)]
2081                 Some(jail) => {
2082                     let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
2083                     #[cfg(feature = "seccomp_trace")]
2084                     debug!(
2085                         "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
2086                         read_jail_addr(jail),
2087                         read_jail_addr(&jail_clone)
2088                     );
2089                     Arc::new(Mutex::new(
2090                         ProxyDevice::new(
2091                             con,
2092                             jail_clone,
2093                             preserved_fds,
2094                             #[cfg(feature = "swap")]
2095                             swap_controller,
2096                         )
2097                         .map_err(Error::CreateProxyDevice)?,
2098                     ))
2099                 }
2100                 #[cfg(windows)]
2101                 Some(_) => unreachable!(),
2102                 None => Arc::new(Mutex::new(con)),
2103             };
2104             io_bus
2105                 .insert(con.clone(), param.debugcon_port.into(), 1)
2106                 .map_err(Error::InsertBus)?;
2107         }
2108 
2109         Ok(())
2110     }
2111 }
2112 
/// Errors reported while building the MSR configuration map.
#[sorted]
#[derive(Error, Debug)]
pub enum MsrError {
    /// The host CPU is not supported.
    // NOTE(review): not constructed in the visible part of this file; per the message it
    // relates to ITMT support — confirm against the caller.
    #[error("CPU not support. Only intel CPUs support ITMT.")]
    CpuUnSupport,
    /// An entry for this MSR index was already present in the map.
    #[error("msr must be unique: {0}")]
    MsrDuplicate(u32),
}
2121 
insert_msr( msr_map: &mut BTreeMap<u32, MsrConfig>, key: u32, msr_config: MsrConfig, ) -> std::result::Result<(), MsrError>2122 fn insert_msr(
2123     msr_map: &mut BTreeMap<u32, MsrConfig>,
2124     key: u32,
2125     msr_config: MsrConfig,
2126 ) -> std::result::Result<(), MsrError> {
2127     if msr_map.insert(key, msr_config).is_some() {
2128         Err(MsrError::MsrDuplicate(key))
2129     } else {
2130         Ok(())
2131     }
2132 }
2133 
insert_msrs( msr_map: &mut BTreeMap<u32, MsrConfig>, msrs: &[(u32, MsrRWType, MsrAction, MsrValueFrom, MsrFilter)], ) -> std::result::Result<(), MsrError>2134 fn insert_msrs(
2135     msr_map: &mut BTreeMap<u32, MsrConfig>,
2136     msrs: &[(u32, MsrRWType, MsrAction, MsrValueFrom, MsrFilter)],
2137 ) -> std::result::Result<(), MsrError> {
2138     for msr in msrs {
2139         insert_msr(
2140             msr_map,
2141             msr.0,
2142             MsrConfig {
2143                 rw_type: msr.1,
2144                 action: msr.2,
2145                 from: msr.3,
2146                 filter: msr.4,
2147             },
2148         )?;
2149     }
2150 
2151     Ok(())
2152 }
2153 
set_enable_pnp_data_msr_config( msr_map: &mut BTreeMap<u32, MsrConfig>, ) -> std::result::Result<(), MsrError>2154 pub fn set_enable_pnp_data_msr_config(
2155     msr_map: &mut BTreeMap<u32, MsrConfig>,
2156 ) -> std::result::Result<(), MsrError> {
2157     let msrs = vec![
2158         (
2159             MSR_IA32_APERF,
2160             MsrRWType::ReadOnly,
2161             MsrAction::MsrPassthrough,
2162             MsrValueFrom::RWFromRunningCPU,
2163             MsrFilter::Default,
2164         ),
2165         (
2166             MSR_IA32_MPERF,
2167             MsrRWType::ReadOnly,
2168             MsrAction::MsrPassthrough,
2169             MsrValueFrom::RWFromRunningCPU,
2170             MsrFilter::Default,
2171         ),
2172     ];
2173 
2174     insert_msrs(msr_map, &msrs)?;
2175 
2176     Ok(())
2177 }
2178 
/// Error returned by `check_host_hybrid_support`.
#[derive(Error, Debug)]
pub enum HybridSupportError {
    /// One of the CPUID checks for hybrid CPU support failed on the host.
    #[error("Host CPU doesn't support hybrid architecture.")]
    UnsupportedHostCpu,
}
2184 
/// Bundles the two CPUID entry points so callers can swap in fakes for testing.
pub struct CpuIdCall {
    /// Either `__cpuid_count` or a test double; called with a leaf and a sub-leaf.
    cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
    /// Either `__cpuid` or a test double; called with a leaf only.
    cpuid: unsafe fn(u32) -> CpuidResult,
}

impl CpuIdCall {
    /// Wraps the given `cpuid_count` and `cpuid` functions.
    pub fn new(
        cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
        cpuid: unsafe fn(u32) -> CpuidResult,
    ) -> CpuIdCall {
        Self { cpuid_count, cpuid }
    }
}
2201 
2202 /// Check if host supports hybrid CPU feature. The check include:
2203 ///     1. Check if CPUID.1AH exists. CPUID.1AH is hybrid information enumeration leaf.
2204 ///     2. Check if CPUID.07H.00H:EDX[bit 15] sets. This bit means the processor is
2205 ///        identified as a hybrid part.
2206 ///     3. Check if CPUID.1AH:EAX sets. The hybrid core type is set in EAX.
2207 ///
2208 /// # Arguments
2209 ///
2210 /// * - `cpuid` the wrapped cpuid functions used to get CPUID info.
check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError>2211 pub fn check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError> {
2212     // CPUID.0H.EAX returns maximum input value for basic CPUID information.
2213     //
2214     // Safe because we pass 0 for this call and the host supports the
2215     // `cpuid` instruction.
2216     let mut cpuid_entry = unsafe { (cpuid.cpuid)(0x0) };
2217     if cpuid_entry.eax < 0x1A {
2218         return Err(HybridSupportError::UnsupportedHostCpu);
2219     }
2220     // Safe because we pass 0x7 and 0 for this call and the host supports the
2221     // `cpuid` instruction.
2222     cpuid_entry = unsafe { (cpuid.cpuid_count)(0x7, 0) };
2223     if cpuid_entry.edx & 1 << EDX_HYBRID_CPU_SHIFT == 0 {
2224         return Err(HybridSupportError::UnsupportedHostCpu);
2225     }
2226     // From SDM, if a value entered for CPUID.EAX is less than or equal to the
2227     // maximum input value and the leaf is not supported on that processor then
2228     // 0 is returned in all the registers.
2229     // For the CPU with hybrid support, its CPUID.1AH.EAX shouldn't be zero.
2230     //
2231     // Safe because we pass 0 for this call and the host supports the
2232     // `cpuid` instruction.
2233     cpuid_entry = unsafe { (cpuid.cpuid)(0x1A) };
2234     if cpuid_entry.eax == 0 {
2235         return Err(HybridSupportError::UnsupportedHostCpu);
2236     }
2237     Ok(())
2238 }
2239 
#[cfg(test)]
mod tests {
    use std::mem::size_of;

    use super::*;

    /// Memory boundary used by the `regions_eq_4gb_*` tests. It has the same value (2 GB) as the
    /// `pci_start` handed to `init_low_memory_layout` in `setup()`.
    const TEST_MEMORY_SIZE: u64 = 2 * GB;

    /// Installs the low-memory layout shared by the layout tests: a 256 MB PCIe ECAM window at
    /// 3 GB and a low PCI MMIO start of 2 GB.
    fn setup() {
        let pcie_ecam = Some(AddressRange::from_start_and_size(3 * GB, 256 * MB).unwrap());
        let pci_start = Some(2 * GB);
        init_low_memory_layout(pcie_ecam, pci_start);
    }

    /// 512 MB of RAM without a BIOS is expected to yield a single region starting at
    /// `START_OF_RAM_32BITS`.
    #[test]
    fn regions_lt_4gb_nobios() {
        setup();
        let regions = arch_memory_regions(512 * MB, /* bios_size */ None);
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        // 1 << 29 bytes is 512 MiB, the size requested above.
        assert_eq!(1u64 << 29, regions[0].1);
    }

    /// RAM slightly larger than 4 GB without a BIOS is expected to be split into two regions,
    /// the second starting at 4 GB, whose sizes add up to the requested amount.
    #[test]
    fn regions_gt_4gb_nobios() {
        setup();
        let size = 4 * GB + 0x8000;
        let regions = arch_memory_regions(size, /* bios_size */ None);
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(GuestAddress(4 * GB), regions[1].0);
        assert_eq!(size, regions[0].1 + regions[1].1);
    }

    /// 512 MB of RAM plus a 1 MB BIOS is expected to yield the RAM region followed by a BIOS
    /// region that ends at `FIRST_ADDR_PAST_32BITS`.
    #[test]
    fn regions_lt_4gb_bios() {
        setup();
        let bios_len = 1 * MB;
        let regions = arch_memory_regions(512 * MB, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(512 * MB, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    /// RAM slightly larger than 4 GB plus a 1 MB BIOS is expected to yield three regions: low
    /// RAM, the BIOS region ending at `FIRST_ADDR_PAST_32BITS`, and high RAM starting at 4 GB.
    #[test]
    fn regions_gt_4gb_bios() {
        setup();
        let bios_len = 1 * MB;
        let regions = arch_memory_regions(4 * GB + 0x8000, Some(bios_len));
        assert_eq!(3, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
        assert_eq!(GuestAddress(4 * GB), regions[2].0);
    }

    /// RAM that ends exactly at `TEST_MEMORY_SIZE` (no BIOS) is expected to stay in one region.
    #[test]
    fn regions_eq_4gb_nobios() {
        setup();
        // Request exactly the distance from the start of RAM to TEST_MEMORY_SIZE, which has the
        // same value as the pci_start configured in setup().
        let size = TEST_MEMORY_SIZE - START_OF_RAM_32BITS;
        let regions = arch_memory_regions(size, /* bios_size */ None);
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(size, regions[0].1);
    }

    /// RAM that ends exactly at `TEST_MEMORY_SIZE` plus a 1 MB BIOS is expected to yield one RAM
    /// region and one BIOS region ending at `FIRST_ADDR_PAST_32BITS`.
    #[test]
    fn regions_eq_4gb_bios() {
        setup();
        // Request exactly the distance from the start of RAM to TEST_MEMORY_SIZE, which has the
        // same value as the pci_start configured in setup().
        let size = TEST_MEMORY_SIZE - START_OF_RAM_32BITS;
        let bios_len = 1 * MB;
        let regions = arch_memory_regions(size, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(size, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    /// The values passed to `init_low_memory_layout` in `setup()` must be reported back by the
    /// layout accessors.
    #[test]
    fn check_pci_mmio_layout() {
        setup();

        assert_eq!(read_pci_mmio_before_32bit().start, 2 * GB);
        assert_eq!(read_pcie_cfg_mmio().start, 3 * GB);
        assert_eq!(read_pcie_cfg_mmio().len().unwrap(), 256 * MB);
    }

    /// The low PCI MMIO window must start below the brya type16 region.
    #[test]
    #[cfg(feature = "direct")]
    #[ignore] // TODO(b/236253615): Fix and re-enable this test.
    fn end_addr_before_32bits() {
        setup();
        // On volteer, type16 (coreboot) region is at 0x00000000769f3000-0x0000000076ffffff.
        // On brya, type16 region is at 0x0000000076876000-0x00000000803fffff
        let brya_type16_address = 0x7687_6000;
        assert!(
            read_pci_mmio_before_32bit().start < brya_type16_address,
            "{} < {}",
            read_pci_mmio_before_32bit().start,
            brya_type16_address
        );
    }

    /// The start of the low PCI MMIO window must be 256 MB aligned.
    #[test]
    fn check_32bit_gap_size_alignment() {
        setup();
        // pci_low_start is 256 MB aligned to be friendly for MTRR mappings.
        assert_eq!(read_pci_mmio_before_32bit().start % (256 * MB), 0);
    }

    /// Writing an empty setup_data list must succeed and report no head address.
    #[test]
    fn write_setup_data_empty() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();
        let setup_data = [];
        let setup_data_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        assert_eq!(setup_data_addr, None);
    }

    /// Writing two setup_data entries must chain them: the first entry's `next` field points at
    /// the second entry, and the second entry's `next` field is zero.
    #[test]
    fn write_setup_data_two_of_them() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();

        // Offsets 12 and 16 are where this test expects the length field and the payload of each
        // entry; presumably they follow the `setup_data_hdr` layout (u64 next, u32 type,
        // u32 len) -- confirm against the bootparam definition.
        let entry1_addr = GuestAddress(0x1000);
        let entry1_next_addr = entry1_addr;
        let entry1_len_addr = entry1_addr.checked_add(12).unwrap();
        let entry1_data_addr = entry1_addr.checked_add(16).unwrap();
        let entry1_data = [0x55u8; 13];
        let entry1_size = (size_of::<setup_data_hdr>() + entry1_data.len()) as u64;
        // Padding the test expects between the end of entry 1 and the start of entry 2.
        let entry1_align = 3;

        let entry2_addr = GuestAddress(entry1_addr.offset() + entry1_size + entry1_align);
        let entry2_next_addr = entry2_addr;
        let entry2_len_addr = entry2_addr.checked_add(12).unwrap();
        let entry2_data_addr = entry2_addr.checked_add(16).unwrap();
        let entry2_data = [0xAAu8; 9];

        let setup_data = [
            SetupData {
                data: entry1_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
            SetupData {
                data: entry2_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
        ];

        let setup_data_head_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        assert_eq!(setup_data_head_addr, Some(entry1_addr));

        // First entry: links to the second entry and carries its own length and payload.
        assert_eq!(
            mem.read_obj_from_addr::<u64>(entry1_next_addr).unwrap(),
            entry2_addr.offset()
        );
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry1_len_addr).unwrap(),
            entry1_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 13]>(entry1_data_addr)
                .unwrap(),
            entry1_data
        );

        // Second entry: terminates the list with a zero `next` field.
        assert_eq!(mem.read_obj_from_addr::<u64>(entry2_next_addr).unwrap(), 0);
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry2_len_addr).unwrap(),
            entry2_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 9]>(entry2_data_addr).unwrap(),
            entry2_data
        );
    }
}
2443