• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! x86 architecture support.
6 
7 #![cfg(target_arch = "x86_64")]
8 
9 mod fdt;
10 
// `setup_data` entry type values. They are used below as the discriminants of `SetupDataType`.
const SETUP_DTB: u32 = 2;
const SETUP_RNG_SEED: u32 = 9;
13 
14 #[allow(dead_code)]
15 #[allow(non_upper_case_globals)]
16 #[allow(non_camel_case_types)]
17 #[allow(non_snake_case)]
18 pub mod bootparam;
19 
20 #[allow(dead_code)]
21 #[allow(non_upper_case_globals)]
22 mod msr_index;
23 
24 #[allow(dead_code)]
25 #[allow(non_upper_case_globals)]
26 #[allow(non_camel_case_types)]
27 #[allow(clippy::all)]
28 mod mpspec;
29 
30 pub mod acpi;
31 mod bzimage;
32 pub mod cpuid;
33 mod gdt;
34 pub mod interrupts;
35 pub mod mptable;
36 pub mod regs;
37 pub mod smbios;
38 
39 use std::arch::x86_64::CpuidResult;
40 use std::collections::BTreeMap;
41 use std::ffi::CStr;
42 use std::ffi::CString;
43 use std::fs::File;
44 use std::io;
45 use std::mem;
46 use std::path::PathBuf;
47 use std::sync::mpsc;
48 use std::sync::Arc;
49 
50 use acpi_tables::aml;
51 use acpi_tables::aml::Aml;
52 use acpi_tables::sdt::SDT;
53 use anyhow::Context;
54 use arch::get_serial_cmdline;
55 use arch::serial::SerialDeviceInfo;
56 use arch::CpuSet;
57 use arch::DtbOverlay;
58 use arch::GetSerialCmdlineError;
59 use arch::RunnableLinuxVm;
60 use arch::VmComponents;
61 use arch::VmImage;
62 #[cfg(feature = "seccomp_trace")]
63 use base::debug;
64 use base::warn;
65 #[cfg(any(target_os = "android", target_os = "linux"))]
66 use base::AsRawDescriptors;
67 use base::Event;
68 use base::FileGetLen;
69 use base::FileReadWriteAtVolatile;
70 use base::SendTube;
71 use base::Tube;
72 use base::TubeError;
73 use chrono::Utc;
74 pub use cpuid::adjust_cpuid;
75 pub use cpuid::CpuIdContext;
76 use devices::acpi::PM_WAKEUP_GPIO;
77 use devices::Bus;
78 use devices::BusDevice;
79 use devices::BusDeviceObj;
80 use devices::BusResumeDevice;
81 use devices::BusType;
82 use devices::Debugcon;
83 use devices::FwCfgParameters;
84 use devices::IrqChip;
85 use devices::IrqChipX86_64;
86 use devices::IrqEventSource;
87 use devices::PciAddress;
88 use devices::PciConfigIo;
89 use devices::PciConfigMmio;
90 use devices::PciDevice;
91 use devices::PciInterruptPin;
92 use devices::PciRoot;
93 use devices::PciRootCommand;
94 use devices::PciVirtualConfigMmio;
95 use devices::Pflash;
96 #[cfg(any(target_os = "android", target_os = "linux"))]
97 use devices::ProxyDevice;
98 use devices::Serial;
99 use devices::SerialHardware;
100 use devices::SerialParameters;
101 #[cfg(any(target_os = "android", target_os = "linux"))]
102 use devices::VirtualPmc;
103 use devices::FW_CFG_BASE_PORT;
104 use devices::FW_CFG_MAX_FILE_SLOTS;
105 use devices::FW_CFG_WIDTH;
106 #[cfg(feature = "gdb")]
107 use gdbstub_arch::x86::reg::id::X86_64CoreRegId;
108 #[cfg(feature = "gdb")]
109 use gdbstub_arch::x86::reg::X86SegmentRegs;
110 #[cfg(feature = "gdb")]
111 use gdbstub_arch::x86::reg::X86_64CoreRegs;
112 #[cfg(feature = "gdb")]
113 use gdbstub_arch::x86::reg::X87FpuInternalRegs;
114 #[cfg(feature = "gdb")]
115 use hypervisor::x86_64::Regs;
116 #[cfg(feature = "gdb")]
117 use hypervisor::x86_64::Sregs;
118 use hypervisor::CpuConfigX86_64;
119 use hypervisor::Hypervisor;
120 use hypervisor::HypervisorX86_64;
121 use hypervisor::ProtectionType;
122 use hypervisor::VcpuInitX86_64;
123 use hypervisor::VcpuX86_64;
124 use hypervisor::Vm;
125 use hypervisor::VmCap;
126 use hypervisor::VmX86_64;
127 #[cfg(feature = "seccomp_trace")]
128 use jail::read_jail_addr;
129 #[cfg(windows)]
130 use jail::FakeMinijailStub as Minijail;
131 #[cfg(any(target_os = "android", target_os = "linux"))]
132 use minijail::Minijail;
133 use once_cell::sync::OnceCell;
134 use rand::rngs::OsRng;
135 use rand::RngCore;
136 use remain::sorted;
137 use resources::AddressRange;
138 use resources::SystemAllocator;
139 use resources::SystemAllocatorConfig;
140 #[cfg(any(target_os = "android", target_os = "linux"))]
141 use sync::Condvar;
142 use sync::Mutex;
143 use thiserror::Error;
144 use vm_control::BatControl;
145 use vm_control::BatteryType;
146 use vm_memory::GuestAddress;
147 use vm_memory::GuestMemory;
148 use vm_memory::GuestMemoryError;
149 use vm_memory::MemoryRegionOptions;
150 use zerocopy::AsBytes;
151 use zerocopy::FromBytes;
152 use zerocopy::FromZeroes;
153 
154 use crate::bootparam::boot_params;
155 use crate::cpuid::EDX_HYBRID_CPU_SHIFT;
156 
157 #[sorted]
158 #[derive(Error, Debug)]
159 pub enum Error {
160     #[error("error allocating a single gpe")]
161     AllocateGpe,
162     #[error("error allocating IO resource: {0}")]
163     AllocateIOResouce(resources::Error),
164     #[error("error allocating a single irq")]
165     AllocateIrq,
166     #[error("unable to clone an Event: {0}")]
167     CloneEvent(base::Error),
168     #[error("failed to clone IRQ chip: {0}")]
169     CloneIrqChip(base::Error),
170     #[cfg(any(target_os = "android", target_os = "linux"))]
171     #[error("failed to clone jail: {0}")]
172     CloneJail(minijail::Error),
173     #[error("unable to clone a Tube: {0}")]
174     CloneTube(TubeError),
175     #[error("the given kernel command line was invalid: {0}")]
176     Cmdline(kernel_cmdline::Error),
177     #[error("failed to configure hotplugged pci device: {0}")]
178     ConfigurePciDevice(arch::DeviceRegistrationError),
179     #[error("failed to configure segment registers: {0}")]
180     ConfigureSegments(regs::Error),
181     #[error("error configuring the system")]
182     ConfigureSystem,
183     #[error("unable to create ACPI tables")]
184     CreateAcpi,
185     #[error("unable to create battery devices: {0}")]
186     CreateBatDevices(arch::DeviceRegistrationError),
187     #[error("could not create debugcon device: {0}")]
188     CreateDebugconDevice(devices::SerialError),
189     #[error("unable to make an Event: {0}")]
190     CreateEvent(base::Error),
191     #[error("failed to create fdt: {0}")]
192     CreateFdt(cros_fdt::Error),
193     #[error("failed to create fw_cfg device: {0}")]
194     CreateFwCfgDevice(devices::FwCfgError),
195     #[error("failed to create IOAPIC device: {0}")]
196     CreateIoapicDevice(base::Error),
197     #[error("failed to create a PCI root hub: {0}")]
198     CreatePciRoot(arch::DeviceRegistrationError),
199     #[error("unable to create PIT: {0}")]
200     CreatePit(base::Error),
201     #[error("unable to make PIT device: {0}")]
202     CreatePitDevice(devices::PitError),
203     #[cfg(any(target_os = "android", target_os = "linux"))]
204     #[error("unable to create proxy device: {0}")]
205     CreateProxyDevice(devices::ProxyError),
206     #[error("unable to create serial devices: {0}")]
207     CreateSerialDevices(arch::DeviceRegistrationError),
208     #[error("failed to create socket: {0}")]
209     CreateSocket(io::Error),
210     #[error("failed to create VCPU: {0}")]
211     CreateVcpu(base::Error),
212     #[error("failed to create Virtio MMIO bus: {0}")]
213     CreateVirtioMmioBus(arch::DeviceRegistrationError),
214     #[error("invalid e820 setup params")]
215     E820Configuration,
216     #[error("failed to enable singlestep execution: {0}")]
217     EnableSinglestep(base::Error),
218     #[error("failed to enable split irqchip: {0}")]
219     EnableSplitIrqchip(base::Error),
220     #[error("failed to get serial cmdline: {0}")]
221     GetSerialCmdline(GetSerialCmdlineError),
222     #[error("failed to insert device onto bus: {0}")]
223     InsertBus(devices::BusError),
224     #[error("the kernel extends past the end of RAM")]
225     InvalidCpuConfig,
226     #[error("invalid CPU config parameters")]
227     KernelOffsetPastEnd,
228     #[error("error loading bios: {0}")]
229     LoadBios(io::Error),
230     #[error("error loading kernel bzImage: {0}")]
231     LoadBzImage(bzimage::Error),
232     #[error("error loading command line: {0}")]
233     LoadCmdline(kernel_loader::Error),
234     #[error("error loading initrd: {0}")]
235     LoadInitrd(arch::LoadImageError),
236     #[error("error loading Kernel: {0}")]
237     LoadKernel(kernel_loader::Error),
238     #[error("error loading pflash: {0}")]
239     LoadPflash(io::Error),
240     #[error("error translating address: Page not present")]
241     PageNotPresent,
242     #[error("error reading guest memory {0}")]
243     ReadingGuestMemory(vm_memory::GuestMemoryError),
244     #[error("single register read not supported on x86_64")]
245     ReadRegIsUnsupported,
246     #[error("error reading CPU registers {0}")]
247     ReadRegs(base::Error),
248     #[error("error registering an IrqFd: {0}")]
249     RegisterIrqfd(base::Error),
250     #[error("error registering virtual socket device: {0}")]
251     RegisterVsock(arch::DeviceRegistrationError),
252     #[error("error reserved pcie config mmio")]
253     ReservePcieCfgMmio(resources::Error),
254     #[error("failed to set a hardware breakpoint: {0}")]
255     SetHwBreakpoint(base::Error),
256     #[error("failed to set identity map addr: {0}")]
257     SetIdentityMapAddr(base::Error),
258     #[error("failed to set interrupts: {0}")]
259     SetLint(interrupts::Error),
260     #[error("failed to set tss addr: {0}")]
261     SetTssAddr(base::Error),
262     #[error("failed to set up cmos: {0}")]
263     SetupCmos(anyhow::Error),
264     #[error("failed to set up cpuid: {0}")]
265     SetupCpuid(cpuid::Error),
266     #[error("setup data too large")]
267     SetupDataTooLarge,
268     #[error("failed to set up FPU: {0}")]
269     SetupFpu(base::Error),
270     #[error("failed to set up guest memory: {0}")]
271     SetupGuestMemory(GuestMemoryError),
272     #[error("failed to set up mptable: {0}")]
273     SetupMptable(mptable::Error),
274     #[error("failed to set up MSRs: {0}")]
275     SetupMsrs(base::Error),
276     #[error("failed to set up page tables: {0}")]
277     SetupPageTables(regs::Error),
278     #[error("failed to set up pflash: {0}")]
279     SetupPflash(anyhow::Error),
280     #[error("failed to set up registers: {0}")]
281     SetupRegs(regs::Error),
282     #[error("failed to set up SMBIOS: {0}")]
283     SetupSmbios(smbios::Error),
284     #[error("failed to set up sregs: {0}")]
285     SetupSregs(base::Error),
286     #[error("failed to translate virtual address")]
287     TranslatingVirtAddr,
288     #[error("protected VMs not supported on x86_64")]
289     UnsupportedProtectionType,
290     #[error("single register write not supported on x86_64")]
291     WriteRegIsUnsupported,
292     #[error("error writing CPU registers {0}")]
293     WriteRegs(base::Error),
294     #[error("error writing guest memory {0}")]
295     WritingGuestMemory(GuestMemoryError),
296     #[error("error writing setup_data: {0}")]
297     WritingSetupData(GuestMemoryError),
298     #[error("the zero page extends past the end of guest_mem")]
299     ZeroPagePastRamEnd,
300     #[error("error writing the zero page of guest memory")]
301     ZeroPageSetup,
302 }
303 
/// Convenience alias for results whose error type is this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
305 
/// Field-less type on which `arch::LinuxArch` is implemented for x86_64.
pub struct X8664arch;
307 
// Like `bootparam::setup_data` without the incomplete array field at the end, which allows us to
// safely implement Copy, Clone
//
// `write_setup_data` writes one of these headers to guest memory directly in front of the
// payload bytes of each entry.
#[repr(C)]
#[derive(Copy, Clone, Default, FromZeroes, FromBytes, AsBytes)]
struct setup_data_hdr {
    // Guest address of the next entry in the list, or 0 for the final entry.
    pub next: u64,
    // Entry type; filled in from a `SetupDataType` discriminant.
    pub type_: u32,
    // Length in bytes of the payload that follows this header.
    pub len: u32,
}
317 
/// Type tag of a `setup_data` entry.
///
/// The discriminant is the raw `u32` value stored in the `type_` field of the entry header.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SetupDataType {
    /// Entry tagged with `SETUP_DTB`.
    Dtb = SETUP_DTB,
    /// Entry tagged with `SETUP_RNG_SEED`; see `setup_data_rng_seed`.
    RngSeed = SETUP_RNG_SEED,
}
324 
/// A single entry to be inserted in the bootparam `setup_data` linked list.
pub struct SetupData {
    /// Payload bytes, written to guest memory right after the entry header.
    pub data: Vec<u8>,
    /// Type tag stored in the entry header.
    pub type_: SetupDataType,
}
330 
/// Memory type of an e820 map entry.
///
/// The discriminant is the raw value stored in the `type_` field of the entry, so the
/// representation is pinned to `u32` to match the `as u32` conversion used when filling the map.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum E820Type {
    /// Range added to the map as RAM.
    Ram = 0x1,
    /// Range added to the map as reserved.
    Reserved = 0x2,
}
335 
// Size units in bytes.
const MB: u64 = 1 << 20;
const GB: u64 = 1 << 30;

pub const BOOT_STACK_POINTER: u64 = 0x8000;
// Guest RAM starts at address 0.
const START_OF_RAM_32BITS: u64 = 0;
// First address above the 32-bit address space (4 GiB).
const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
// Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
const HIGH_MMIO_MAX_END: u64 = (1u64 << 46) - 1;
pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
// Guest address at which `configure_system` writes the `boot_params` structure.
pub const ZERO_PAGE_OFFSET: u64 = 0x7000;
// Set BIOS max size to 16M: this is used only when `unrestricted guest` is disabled
const BIOS_MAX_SIZE: u64 = 0x1000000;

pub const KERNEL_START_OFFSET: u64 = 0x20_0000;
const CMDLINE_OFFSET: u64 = 0x2_0000;
const CMDLINE_MAX_SIZE: u64 = 0x800; // including terminating zero
// The setup_data area starts right after the command line buffer and ends where the ACPI RSDP
// window begins.
const SETUP_DATA_START: u64 = CMDLINE_OFFSET + CMDLINE_MAX_SIZE;
const SETUP_DATA_END: u64 = ACPI_HI_RSDP_WINDOW_BASE;
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// X86_64_SCI_IRQ is used to fill the ACPI FACP table.
// The SCI IRQ number should preferably be a legacy IRQ number, i.e. less than 16 (most
// platforms actually use the fixed IRQ number 9). So IRQ number 5 is reserved for SCI and the
// other devices start after it.
pub const X86_64_SCI_IRQ: u32 = 5;
// The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
pub const X86_64_IRQ_BASE: u32 = 9;
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;
366 
/// CPU vendor as reported by `get_cpu_manufacturer`.
#[derive(Debug, PartialEq, Eq)]
pub enum CpuManufacturer {
    Intel,
    Amd,
    // NOTE(review): presumably returned for any vendor other than Intel or AMD; confirm
    // against `cpuid::cpu_manufacturer`.
    Unknown,
}
373 
/// Returns the CPU manufacturer as determined by `cpuid::cpu_manufacturer`.
pub fn get_cpu_manufacturer() -> CpuManufacturer {
    cpuid::cpu_manufacturer()
}
377 
// Memory layout below 4G
//
// Built once by `init_low_memory_layout` and read through `read_pci_mmio_before_32bit` and
// `read_pcie_cfg_mmio`.
struct LowMemoryLayout {
    // the pci mmio range below 4G
    pci_mmio: AddressRange,
    // the pcie cfg mmio range
    pcie_cfg_mmio: AddressRange,
}
385 
// Process-wide low memory layout; empty until `init_low_memory_layout` has been called.
static LOW_MEMORY_LAYOUT: OnceCell<LowMemoryLayout> = OnceCell::new();
387 
init_low_memory_layout(pcie_ecam: Option<AddressRange>, pci_low_start: Option<u64>)388 pub fn init_low_memory_layout(pcie_ecam: Option<AddressRange>, pci_low_start: Option<u64>) {
389     LOW_MEMORY_LAYOUT.get_or_init(|| {
390         // Make sure it align to 256MB for MTRR convenient
391         const MEM_32BIT_GAP_SIZE: u64 = 768 * MB;
392         // Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/.....
393         const RESERVED_MEM_SIZE: u64 = 0x800_0000;
394         const PCI_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
395         // Reserve 64MB for pcie enhanced configuration
396         const DEFAULT_PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000;
397         const DEFAULT_PCIE_CFG_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
398         const DEFAULT_PCIE_CFG_MMIO_START: u64 =
399             DEFAULT_PCIE_CFG_MMIO_END - DEFAULT_PCIE_CFG_MMIO_SIZE + 1;
400         const DEFAULT_PCIE_CFG_MMIO: AddressRange = AddressRange {
401             start: DEFAULT_PCIE_CFG_MMIO_START,
402             end: DEFAULT_PCIE_CFG_MMIO_END,
403         };
404 
405         let pcie_cfg_mmio = pcie_ecam.unwrap_or(DEFAULT_PCIE_CFG_MMIO);
406 
407         let pci_mmio = if let Some(pci_low) = pci_low_start {
408             AddressRange {
409                 start: pci_low,
410                 end: PCI_MMIO_END,
411             }
412         } else {
413             AddressRange {
414                 start: pcie_cfg_mmio
415                     .start
416                     .min(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE),
417                 end: PCI_MMIO_END,
418             }
419         };
420 
421         LowMemoryLayout {
422             pci_mmio,
423             pcie_cfg_mmio,
424         }
425     });
426 }
427 
read_pci_mmio_before_32bit() -> AddressRange428 pub fn read_pci_mmio_before_32bit() -> AddressRange {
429     LOW_MEMORY_LAYOUT.get().unwrap().pci_mmio
430 }
read_pcie_cfg_mmio() -> AddressRange431 pub fn read_pcie_cfg_mmio() -> AddressRange {
432     LOW_MEMORY_LAYOUT.get().unwrap().pcie_cfg_mmio
433 }
434 
435 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
436 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
437 /// pointer at the effective physical address 0xFFFF_FFF0.
bios_start(bios_size: u64) -> GuestAddress438 fn bios_start(bios_size: u64) -> GuestAddress {
439     GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
440 }
441 
identity_map_addr_start() -> GuestAddress442 fn identity_map_addr_start() -> GuestAddress {
443     // Set Identity map address 4 pages before the max BIOS size
444     GuestAddress(FIRST_ADDR_PAST_32BITS - BIOS_MAX_SIZE - 4 * 0x1000)
445 }
446 
tss_addr_start() -> GuestAddress447 fn tss_addr_start() -> GuestAddress {
448     // Set TSS address one page after identity map address
449     GuestAddress(identity_map_addr_start().offset() + 0x1000)
450 }
451 
tss_addr_end() -> GuestAddress452 fn tss_addr_end() -> GuestAddress {
453     // Set TSS address section to have 3 pages
454     GuestAddress(tss_addr_start().offset() + 0x3000)
455 }
456 
configure_system( guest_mem: &GuestMemory, kernel_addr: GuestAddress, cmdline_addr: GuestAddress, cmdline_size: usize, setup_data: Option<GuestAddress>, initrd: Option<(GuestAddress, usize)>, mut params: boot_params, ) -> Result<()>457 fn configure_system(
458     guest_mem: &GuestMemory,
459     kernel_addr: GuestAddress,
460     cmdline_addr: GuestAddress,
461     cmdline_size: usize,
462     setup_data: Option<GuestAddress>,
463     initrd: Option<(GuestAddress, usize)>,
464     mut params: boot_params,
465 ) -> Result<()> {
466     const EBDA_START: u64 = 0x0009_fc00;
467     const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
468     const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
469     const KERNEL_LOADER_OTHER: u8 = 0xff;
470     const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.
471 
472     params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
473     params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
474     params.hdr.header = KERNEL_HDR_MAGIC;
475     params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
476     params.ext_cmd_line_ptr = (cmdline_addr.offset() >> 32) as u32;
477     params.hdr.cmdline_size = cmdline_size as u32;
478     params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
479     if let Some(setup_data) = setup_data {
480         params.hdr.setup_data = setup_data.offset();
481     }
482     if let Some((initrd_addr, initrd_size)) = initrd {
483         params.hdr.ramdisk_image = initrd_addr.offset() as u32;
484         params.hdr.ramdisk_size = initrd_size as u32;
485     }
486 
487     add_e820_entry(
488         &mut params,
489         AddressRange {
490             start: START_OF_RAM_32BITS,
491             end: EBDA_START - 1,
492         },
493         E820Type::Ram,
494     )?;
495 
496     // GuestMemory::end_addr() returns the first address past the end, so subtract 1 to get the
497     // inclusive end.
498     let guest_mem_end = guest_mem.end_addr().offset() - 1;
499     let ram_below_4g = AddressRange {
500         start: kernel_addr.offset(),
501         end: guest_mem_end.min(read_pci_mmio_before_32bit().start - 1),
502     };
503     let ram_above_4g = AddressRange {
504         start: FIRST_ADDR_PAST_32BITS,
505         end: guest_mem_end,
506     };
507     add_e820_entry(&mut params, ram_below_4g, E820Type::Ram)?;
508     if !ram_above_4g.is_empty() {
509         add_e820_entry(&mut params, ram_above_4g, E820Type::Ram)?
510     }
511 
512     let pcie_cfg_mmio_range = read_pcie_cfg_mmio();
513     add_e820_entry(&mut params, pcie_cfg_mmio_range, E820Type::Reserved)?;
514 
515     add_e820_entry(
516         &mut params,
517         X8664arch::get_pcie_vcfg_mmio_range(guest_mem, &pcie_cfg_mmio_range),
518         E820Type::Reserved,
519     )?;
520 
521     // Reserve memory section for Identity map and TSS
522     add_e820_entry(
523         &mut params,
524         AddressRange {
525             start: identity_map_addr_start().offset(),
526             end: tss_addr_end().offset() - 1,
527         },
528         E820Type::Reserved,
529     )?;
530 
531     let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
532     if !guest_mem.is_valid_range(zero_page_addr, mem::size_of::<boot_params>() as u64) {
533         return Err(Error::ZeroPagePastRamEnd);
534     }
535 
536     guest_mem
537         .write_obj_at_addr(params, zero_page_addr)
538         .map_err(|_| Error::ZeroPageSetup)?;
539 
540     Ok(())
541 }
542 
543 /// Write setup_data entries in guest memory and link them together with the `next` field.
544 ///
545 /// Returns the guest address of the first entry in the setup_data list, if any.
write_setup_data( guest_mem: &GuestMemory, setup_data_start: GuestAddress, setup_data_end: GuestAddress, setup_data: &[SetupData], ) -> Result<Option<GuestAddress>>546 fn write_setup_data(
547     guest_mem: &GuestMemory,
548     setup_data_start: GuestAddress,
549     setup_data_end: GuestAddress,
550     setup_data: &[SetupData],
551 ) -> Result<Option<GuestAddress>> {
552     let mut setup_data_list_head = None;
553 
554     // Place the first setup_data at the first 64-bit aligned offset following setup_data_start.
555     let mut setup_data_addr = setup_data_start.align(8).ok_or(Error::SetupDataTooLarge)?;
556 
557     let mut entry_iter = setup_data.iter().peekable();
558     while let Some(entry) = entry_iter.next() {
559         if setup_data_list_head.is_none() {
560             setup_data_list_head = Some(setup_data_addr);
561         }
562 
563         // Ensure the entry (header plus data) fits into guest memory.
564         let entry_size = (mem::size_of::<setup_data_hdr>() + entry.data.len()) as u64;
565         let entry_end = setup_data_addr
566             .checked_add(entry_size)
567             .ok_or(Error::SetupDataTooLarge)?;
568 
569         if entry_end >= setup_data_end {
570             return Err(Error::SetupDataTooLarge);
571         }
572 
573         let next_setup_data_addr = if entry_iter.peek().is_some() {
574             // Place the next setup_data at a 64-bit aligned address.
575             setup_data_addr
576                 .checked_add(entry_size)
577                 .and_then(|addr| addr.align(8))
578                 .ok_or(Error::SetupDataTooLarge)?
579         } else {
580             // This is the final entry. Terminate the list with next == 0.
581             GuestAddress(0)
582         };
583 
584         let hdr = setup_data_hdr {
585             next: next_setup_data_addr.offset(),
586             type_: entry.type_ as u32,
587             len: entry
588                 .data
589                 .len()
590                 .try_into()
591                 .map_err(|_| Error::SetupDataTooLarge)?,
592         };
593 
594         guest_mem
595             .write_obj_at_addr(hdr, setup_data_addr)
596             .map_err(Error::WritingSetupData)?;
597         guest_mem
598             .write_all_at_addr(
599                 &entry.data,
600                 setup_data_addr.unchecked_add(mem::size_of::<setup_data_hdr>() as u64),
601             )
602             .map_err(Error::WritingSetupData)?;
603 
604         setup_data_addr = next_setup_data_addr;
605     }
606 
607     Ok(setup_data_list_head)
608 }
609 
610 /// Generate a SETUP_RNG_SEED SetupData with random seed data.
setup_data_rng_seed() -> SetupData611 fn setup_data_rng_seed() -> SetupData {
612     let mut data = vec![0u8; 256];
613     OsRng.fill_bytes(&mut data);
614     SetupData {
615         data,
616         type_: SetupDataType::RngSeed,
617     }
618 }
619 
620 /// Add an e820 region to the e820 map.
621 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
add_e820_entry(params: &mut boot_params, range: AddressRange, mem_type: E820Type) -> Result<()>622 fn add_e820_entry(params: &mut boot_params, range: AddressRange, mem_type: E820Type) -> Result<()> {
623     if params.e820_entries >= params.e820_table.len() as u8 {
624         return Err(Error::E820Configuration);
625     }
626 
627     let size = range.len().ok_or(Error::E820Configuration)?;
628 
629     params.e820_table[params.e820_entries as usize].addr = range.start;
630     params.e820_table[params.e820_entries as usize].size = size;
631     params.e820_table[params.e820_entries as usize].type_ = mem_type as u32;
632     params.e820_entries += 1;
633 
634     Ok(())
635 }
636 
637 /// Returns a Vec of the valid memory addresses.
638 /// These should be used to configure the GuestMemory structure for the platform.
639 /// For x86_64 all addresses are valid from the start of the kernel except a
640 /// carve out at the end of 32bit address space.
arch_memory_regions( size: u64, bios_size: Option<u64>, ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>641 pub fn arch_memory_regions(
642     size: u64,
643     bios_size: Option<u64>,
644 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
645     let mem_start = START_OF_RAM_32BITS;
646     let mem_end = GuestAddress(size + mem_start);
647 
648     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
649     let end_32bit_gap_start = GuestAddress(read_pci_mmio_before_32bit().start);
650 
651     let mut regions = Vec::new();
652     if mem_end <= end_32bit_gap_start {
653         regions.push((GuestAddress(mem_start), size, Default::default()));
654         if let Some(bios_size) = bios_size {
655             regions.push((bios_start(bios_size), bios_size, Default::default()));
656         }
657     } else {
658         regions.push((
659             GuestAddress(mem_start),
660             end_32bit_gap_start.offset() - mem_start,
661             Default::default(),
662         ));
663         if let Some(bios_size) = bios_size {
664             regions.push((bios_start(bios_size), bios_size, Default::default()));
665         }
666         regions.push((
667             first_addr_past_32bits,
668             mem_end.offset_from(end_32bit_gap_start),
669             Default::default(),
670         ));
671     }
672 
673     regions
674 }
675 
676 impl arch::LinuxArch for X8664arch {
677     type Error = Error;
678 
guest_memory_layout( components: &VmComponents, _hypervisor: &impl Hypervisor, ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>679     fn guest_memory_layout(
680         components: &VmComponents,
681         _hypervisor: &impl Hypervisor,
682     ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error> {
683         init_low_memory_layout(components.pcie_ecam, components.pci_low_start);
684 
685         let bios_size = match &components.vm_image {
686             VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
687             VmImage::Kernel(_) => None,
688         };
689 
690         Ok(arch_memory_regions(components.memory_size, bios_size))
691     }
692 
get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig693     fn get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig {
694         SystemAllocatorConfig {
695             io: Some(AddressRange {
696                 start: 0xc000,
697                 end: 0xffff,
698             }),
699             low_mmio: read_pci_mmio_before_32bit(),
700             high_mmio: Self::get_high_mmio_range(vm),
701             platform_mmio: None,
702             first_irq: X86_64_IRQ_BASE,
703         }
704     }
705 
build_vm<V, Vcpu>( mut components: VmComponents, vm_evt_wrtube: &SendTube, system_allocator: &mut SystemAllocator, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, battery: (Option<BatteryType>, Option<Minijail>), mut vm: V, ramoops_region: Option<arch::pstore::RamoopsRegion>, devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, irq_chip: &mut dyn IrqChipX86_64, vcpu_ids: &mut Vec<usize>, dump_device_tree_blob: Option<PathBuf>, debugcon_jail: Option<Minijail>, pflash_jail: Option<Minijail>, fw_cfg_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, #[cfg(any(target_os = "android", target_os = "linux"))] guest_suspended_cvar: Option< Arc<(Mutex<bool>, Condvar)>, >, device_tree_overlays: Vec<DtbOverlay>, ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error> where V: VmX86_64, Vcpu: VcpuX86_64,706     fn build_vm<V, Vcpu>(
707         mut components: VmComponents,
708         vm_evt_wrtube: &SendTube,
709         system_allocator: &mut SystemAllocator,
710         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
711         serial_jail: Option<Minijail>,
712         battery: (Option<BatteryType>, Option<Minijail>),
713         mut vm: V,
714         ramoops_region: Option<arch::pstore::RamoopsRegion>,
715         devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
716         irq_chip: &mut dyn IrqChipX86_64,
717         vcpu_ids: &mut Vec<usize>,
718         dump_device_tree_blob: Option<PathBuf>,
719         debugcon_jail: Option<Minijail>,
720         pflash_jail: Option<Minijail>,
721         fw_cfg_jail: Option<Minijail>,
722         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
723         #[cfg(any(target_os = "android", target_os = "linux"))] guest_suspended_cvar: Option<
724             Arc<(Mutex<bool>, Condvar)>,
725         >,
726         device_tree_overlays: Vec<DtbOverlay>,
727     ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
728     where
729         V: VmX86_64,
730         Vcpu: VcpuX86_64,
731     {
732         if components.hv_cfg.protection_type != ProtectionType::Unprotected {
733             return Err(Error::UnsupportedProtectionType);
734         }
735 
736         let mem = vm.get_memory().clone();
737 
738         let vcpu_count = components.vcpu_count;
739 
740         vm.set_identity_map_addr(identity_map_addr_start())
741             .map_err(Error::SetIdentityMapAddr)?;
742 
743         vm.set_tss_addr(tss_addr_start())
744             .map_err(Error::SetTssAddr)?;
745 
746         // Use IRQ info in ACPI if provided by the user.
747         let mut mptable = true;
748         let mut sci_irq = X86_64_SCI_IRQ;
749 
750         // punch pcie config mmio from pci low mmio, so that it couldn't be
751         // allocated to any device.
752         let pcie_cfg_mmio_range = read_pcie_cfg_mmio();
753         system_allocator
754             .reserve_mmio(pcie_cfg_mmio_range)
755             .map_err(Error::ReservePcieCfgMmio)?;
756 
757         for sdt in components.acpi_sdts.iter() {
758             if sdt.is_signature(b"FACP") {
759                 mptable = false;
760                 let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
761                 sci_irq = sci_irq_fadt.into();
762                 if !system_allocator.reserve_irq(sci_irq) {
763                     warn!("sci irq {} already reserved.", sci_irq);
764                 }
765             }
766         }
767 
768         let pcie_vcfg_range = Self::get_pcie_vcfg_mmio_range(&mem, &pcie_cfg_mmio_range);
769         let mmio_bus = Arc::new(Bus::new(BusType::Mmio));
770         let io_bus = Arc::new(Bus::new(BusType::Io));
771 
772         let (pci_devices, devs): (Vec<_>, Vec<_>) = devs
773             .into_iter()
774             .partition(|(dev, _)| dev.as_pci_device().is_some());
775 
776         let pci_devices = pci_devices
777             .into_iter()
778             .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
779             .collect();
780 
781         let (pci, pci_irqs, mut pid_debug_label_map, amls, gpe_scope_amls) =
782             arch::generate_pci_root(
783                 pci_devices,
784                 irq_chip.as_irq_chip_mut(),
785                 mmio_bus.clone(),
786                 GuestAddress(pcie_cfg_mmio_range.start),
787                 12,
788                 io_bus.clone(),
789                 system_allocator,
790                 &mut vm,
791                 4, // Share the four pin interrupts (INTx#)
792                 Some(pcie_vcfg_range.start),
793                 #[cfg(feature = "swap")]
794                 swap_controller,
795             )
796             .map_err(Error::CreatePciRoot)?;
797 
798         let pci = Arc::new(Mutex::new(pci));
799         pci.lock().enable_pcie_cfg_mmio(pcie_cfg_mmio_range.start);
800         let pci_cfg = PciConfigIo::new(
801             pci.clone(),
802             components.break_linux_pci_config_io,
803             vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
804         );
805         let pci_bus = Arc::new(Mutex::new(pci_cfg));
806         io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();
807 
808         let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
809         let pcie_cfg_mmio_len = pcie_cfg_mmio_range.len().unwrap();
810         mmio_bus
811             .insert(pcie_cfg_mmio, pcie_cfg_mmio_range.start, pcie_cfg_mmio_len)
812             .unwrap();
813 
814         let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 13)));
815         mmio_bus
816             .insert(
817                 pcie_vcfg_mmio,
818                 pcie_vcfg_range.start,
819                 pcie_vcfg_range.len().unwrap(),
820             )
821             .unwrap();
822 
823         let (virtio_mmio_devices, _others): (Vec<_>, Vec<_>) = devs
824             .into_iter()
825             .partition(|(dev, _)| dev.as_virtio_mmio_device().is_some());
826 
827         let virtio_mmio_devices = virtio_mmio_devices
828             .into_iter()
829             .map(|(dev, jail_orig)| (*(dev.into_virtio_mmio_device().unwrap()), jail_orig))
830             .collect();
831         let (mut virtio_mmio_pid, sdts) = arch::generate_virtio_mmio_bus(
832             virtio_mmio_devices,
833             irq_chip.as_irq_chip_mut(),
834             &mmio_bus,
835             system_allocator,
836             &mut vm,
837             components.acpi_sdts,
838             #[cfg(feature = "swap")]
839             swap_controller,
840         )
841         .map_err(Error::CreateVirtioMmioBus)?;
842         components.acpi_sdts = sdts;
843         pid_debug_label_map.append(&mut virtio_mmio_pid);
844 
845         // Event used to notify crosvm that guest OS is trying to suspend.
846         let suspend_evt = Event::new().map_err(Error::CreateEvent)?;
847 
848         if components.fw_cfg_enable {
849             Self::setup_fw_cfg_device(
850                 &io_bus,
851                 components.fw_cfg_parameters.clone(),
852                 components.bootorder_fw_cfg_blob.clone(),
853                 fw_cfg_jail,
854                 #[cfg(feature = "swap")]
855                 swap_controller,
856             )?;
857         }
858 
859         if !components.no_i8042 {
860             Self::setup_legacy_i8042_device(
861                 &io_bus,
862                 irq_chip.pit_uses_speaker_port(),
863                 vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
864             )?;
865         }
866         let vm_request_tube = if !components.no_rtc {
867             let (host_tube, device_tube) = Tube::pair()
868                 .context("create tube")
869                 .map_err(Error::SetupCmos)?;
870             Self::setup_legacy_cmos_device(&io_bus, irq_chip, device_tube, components.memory_size)
871                 .map_err(Error::SetupCmos)?;
872             Some(host_tube)
873         } else {
874             None
875         };
876         let serial_devices = Self::setup_serial_devices(
877             components.hv_cfg.protection_type,
878             irq_chip.as_irq_chip_mut(),
879             &io_bus,
880             serial_parameters,
881             serial_jail,
882             #[cfg(feature = "swap")]
883             swap_controller,
884         )?;
885         Self::setup_debugcon_devices(
886             components.hv_cfg.protection_type,
887             &io_bus,
888             serial_parameters,
889             debugcon_jail,
890             #[cfg(feature = "swap")]
891             swap_controller,
892         )?;
893 
894         let bios_size = if let VmImage::Bios(ref bios) = components.vm_image {
895             bios.metadata().map_err(Error::LoadBios)?.len()
896         } else {
897             0
898         };
899         if let Some(pflash_image) = components.pflash_image {
900             Self::setup_pflash(
901                 pflash_image,
902                 components.pflash_block_size,
903                 bios_size,
904                 &mmio_bus,
905                 pflash_jail,
906                 #[cfg(feature = "swap")]
907                 swap_controller,
908             )?;
909         }
910 
911         // Functions that use/create jails MUST be used before the call to
912         // setup_acpi_devices below, as this move us into a multiprocessing state
913         // from which we can no longer fork.
914 
915         let mut resume_notify_devices = Vec::new();
916 
917         // each bus occupy 1MB mmio for pcie enhanced configuration
918         let max_bus = (pcie_cfg_mmio_len / 0x100000 - 1) as u8;
919         let (mut acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
920             pci.clone(),
921             &mem,
922             &io_bus,
923             system_allocator,
924             suspend_evt.try_clone().map_err(Error::CloneEvent)?,
925             vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
926             components.acpi_sdts,
927             irq_chip.as_irq_chip_mut(),
928             sci_irq,
929             battery,
930             &mmio_bus,
931             max_bus,
932             &mut resume_notify_devices,
933             #[cfg(feature = "swap")]
934             swap_controller,
935             #[cfg(any(target_os = "android", target_os = "linux"))]
936             components.ac_adapter,
937             #[cfg(any(target_os = "android", target_os = "linux"))]
938             guest_suspended_cvar,
939             &pci_irqs,
940         )?;
941 
942         // Create customized SSDT table
943         let sdt = acpi::create_customize_ssdt(pci.clone(), amls, gpe_scope_amls);
944         if let Some(sdt) = sdt {
945             acpi_dev_resource.sdts.push(sdt);
946         }
947 
948         irq_chip
949             .finalize_devices(system_allocator, &io_bus, &mmio_bus)
950             .map_err(Error::RegisterIrqfd)?;
951 
952         // All of these bios generated tables are set manually for the benefit of the kernel boot
953         // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't
954         // have a way to pass the BIOS these configs.
955         // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
956         // tables and the guest OS picks them up.
957         // If another guest does need a way to pass these tables down to it's BIOS, this approach
958         // should be rethought.
959 
960         if mptable {
961             // Note that this puts the mptable at 0x9FC00 in guest physical memory.
962             mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
963                 .map_err(Error::SetupMptable)?;
964         }
965         smbios::setup_smbios(&mem, &components.smbios, bios_size).map_err(Error::SetupSmbios)?;
966 
967         let host_cpus = if components.host_cpu_topology {
968             components.vcpu_affinity.clone()
969         } else {
970             None
971         };
972 
973         // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
974         acpi::create_acpi_tables(
975             &mem,
976             vcpu_count as u8,
977             sci_irq,
978             0xcf9,
979             6, // RST_CPU|SYS_RST
980             &acpi_dev_resource,
981             host_cpus,
982             vcpu_ids,
983             &pci_irqs,
984             pcie_cfg_mmio_range.start,
985             max_bus,
986             components.force_s2idle,
987         )
988         .ok_or(Error::CreateAcpi)?;
989 
990         let mut cmdline = Self::get_base_linux_cmdline();
991 
992         get_serial_cmdline(&mut cmdline, serial_parameters, "io", &serial_devices)
993             .map_err(Error::GetSerialCmdline)?;
994 
995         for param in components.extra_kernel_params {
996             cmdline.insert_str(&param).map_err(Error::Cmdline)?;
997         }
998 
999         if let Some(ramoops_region) = ramoops_region {
1000             arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
1001                 .map_err(Error::Cmdline)?;
1002         }
1003 
1004         let pci_start = read_pci_mmio_before_32bit().start;
1005 
1006         let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count];
1007         let mut msrs = BTreeMap::new();
1008 
1009         match components.vm_image {
1010             VmImage::Bios(ref mut bios) => {
1011                 // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
1012                 kernel_loader::load_cmdline(
1013                     &mem,
1014                     GuestAddress(CMDLINE_OFFSET),
1015                     &CString::new(cmdline).unwrap(),
1016                 )
1017                 .map_err(Error::LoadCmdline)?;
1018                 Self::load_bios(&mem, bios)?;
1019                 regs::set_default_msrs(&mut msrs);
1020                 // The default values for `Regs` and `Sregs` already set up the reset vector.
1021             }
1022             VmImage::Kernel(ref mut kernel_image) => {
1023                 let (params, kernel_end, kernel_entry) = Self::load_kernel(&mem, kernel_image)?;
1024 
1025                 Self::setup_system_memory(
1026                     &mem,
1027                     &CString::new(cmdline).unwrap(),
1028                     components.initrd_image,
1029                     components.android_fstab,
1030                     kernel_end,
1031                     params,
1032                     dump_device_tree_blob,
1033                     device_tree_overlays,
1034                 )?;
1035 
1036                 // Configure the bootstrap VCPU for the Linux/x86 64-bit boot protocol.
1037                 // <https://www.kernel.org/doc/html/latest/x86/boot.html>
1038                 vcpu_init[0].regs.rip = kernel_entry.offset();
1039                 vcpu_init[0].regs.rsp = BOOT_STACK_POINTER;
1040                 vcpu_init[0].regs.rsi = ZERO_PAGE_OFFSET;
1041 
1042                 regs::set_long_mode_msrs(&mut msrs);
1043                 regs::set_mtrr_msrs(&mut msrs, &vm, pci_start);
1044 
1045                 // Set up long mode and enable paging.
1046                 regs::configure_segments_and_sregs(&mem, &mut vcpu_init[0].sregs)
1047                     .map_err(Error::ConfigureSegments)?;
1048                 regs::setup_page_tables(&mem, &mut vcpu_init[0].sregs)
1049                     .map_err(Error::SetupPageTables)?;
1050             }
1051         }
1052 
1053         // Initialize MSRs for all VCPUs.
1054         for vcpu in vcpu_init.iter_mut() {
1055             vcpu.msrs = msrs.clone();
1056         }
1057 
1058         Ok(RunnableLinuxVm {
1059             vm,
1060             vcpu_count,
1061             vcpus: None,
1062             vcpu_affinity: components.vcpu_affinity,
1063             vcpu_init,
1064             no_smt: components.no_smt,
1065             irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?,
1066             io_bus,
1067             mmio_bus,
1068             pid_debug_label_map,
1069             suspend_evt,
1070             resume_notify_devices,
1071             rt_cpus: components.rt_cpus,
1072             delay_rt: components.delay_rt,
1073             bat_control,
1074             #[cfg(feature = "gdb")]
1075             gdb: components.gdb,
1076             pm: Some(acpi_dev_resource.pm),
1077             root_config: pci,
1078             #[cfg(any(target_os = "android", target_os = "linux"))]
1079             platform_devices: Vec::new(),
1080             hotplug_bus: BTreeMap::new(),
1081             devices_thread: None,
1082             vm_request_tube,
1083         })
1084     }
1085 
configure_vcpu<V: Vm>( vm: &V, hypervisor: &dyn HypervisorX86_64, irq_chip: &mut dyn IrqChipX86_64, vcpu: &mut dyn VcpuX86_64, vcpu_init: VcpuInitX86_64, vcpu_id: usize, num_cpus: usize, cpu_config: Option<CpuConfigX86_64>, ) -> Result<()>1086     fn configure_vcpu<V: Vm>(
1087         vm: &V,
1088         hypervisor: &dyn HypervisorX86_64,
1089         irq_chip: &mut dyn IrqChipX86_64,
1090         vcpu: &mut dyn VcpuX86_64,
1091         vcpu_init: VcpuInitX86_64,
1092         vcpu_id: usize,
1093         num_cpus: usize,
1094         cpu_config: Option<CpuConfigX86_64>,
1095     ) -> Result<()> {
1096         let cpu_config = match cpu_config {
1097             Some(config) => config,
1098             None => return Err(Error::InvalidCpuConfig),
1099         };
1100         if !vm.check_capability(VmCap::EarlyInitCpuid) {
1101             cpuid::setup_cpuid(hypervisor, irq_chip, vcpu, vcpu_id, num_cpus, cpu_config)
1102                 .map_err(Error::SetupCpuid)?;
1103         }
1104 
1105         vcpu.set_regs(&vcpu_init.regs).map_err(Error::WriteRegs)?;
1106 
1107         vcpu.set_sregs(&vcpu_init.sregs)
1108             .map_err(Error::SetupSregs)?;
1109 
1110         vcpu.set_fpu(&vcpu_init.fpu).map_err(Error::SetupFpu)?;
1111 
1112         let vcpu_supported_var_mtrrs = regs::vcpu_supported_variable_mtrrs(vcpu);
1113         let num_var_mtrrs = regs::count_variable_mtrrs(&vcpu_init.msrs);
1114         let skip_mtrr_msrs = if num_var_mtrrs > vcpu_supported_var_mtrrs {
1115             warn!(
1116                 "Too many variable MTRR entries ({} required, {} supported),
1117                 please check pci_start addr, guest with pass through device may be very slow",
1118                 num_var_mtrrs, vcpu_supported_var_mtrrs,
1119             );
1120             // Filter out the MTRR entries from the MSR list.
1121             true
1122         } else {
1123             false
1124         };
1125 
1126         for (msr_index, value) in vcpu_init.msrs.into_iter() {
1127             if skip_mtrr_msrs && regs::is_mtrr_msr(msr_index) {
1128                 continue;
1129             }
1130 
1131             vcpu.set_msr(msr_index, value).map_err(Error::SetupMsrs)?;
1132         }
1133 
1134         interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
1135 
1136         Ok(())
1137     }
1138 
register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>( linux: &mut RunnableLinuxVm<V, Vcpu>, device: Box<dyn PciDevice>, #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>, resources: &mut SystemAllocator, hp_control_tube: &mpsc::Sender<PciRootCommand>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<PciAddress>1139     fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>(
1140         linux: &mut RunnableLinuxVm<V, Vcpu>,
1141         device: Box<dyn PciDevice>,
1142         #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
1143         resources: &mut SystemAllocator,
1144         hp_control_tube: &mpsc::Sender<PciRootCommand>,
1145         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1146     ) -> Result<PciAddress> {
1147         arch::configure_pci_device(
1148             linux,
1149             device,
1150             #[cfg(any(target_os = "android", target_os = "linux"))]
1151             minijail,
1152             resources,
1153             hp_control_tube,
1154             #[cfg(feature = "swap")]
1155             swap_controller,
1156         )
1157         .map_err(Error::ConfigurePciDevice)
1158     }
1159 
get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>>1160     fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>> {
1161         Ok(BTreeMap::new())
1162     }
1163 
get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>>1164     fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>> {
1165         Ok(BTreeMap::new())
1166     }
1167 
get_host_cpu_clusters() -> Result<Vec<CpuSet>>1168     fn get_host_cpu_clusters() -> Result<Vec<CpuSet>> {
1169         Ok(Vec::new())
1170     }
1171 }
1172 
#[cfg(feature = "gdb")]
impl<T: VcpuX86_64> arch::GdbOps<T> for X8664arch {
    type Error = Error;

    /// Reads the vCPU's general, segment, x87 and SSE registers into GDB's core register set.
    ///
    /// Any failure to read vCPU state is reported as `Error::ReadRegs`.
    fn read_registers(vcpu: &T) -> Result<X86_64CoreRegs> {
        let gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
        let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        let fpu = vcpu.get_fpu().map_err(Error::ReadRegs)?;

        let mut core = X86_64CoreRegs {
            // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
            regs: [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp,
                gregs.rsp, gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13,
                gregs.r14, gregs.r15,
            ],
            // GDB exposes 32-bit eflags instead of 64-bit rflags.
            // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
            eflags: gregs.rflags as u32,
            rip: gregs.rip,
            // Segment registers: CS, SS, DS, ES, FS, GS (selectors only).
            segments: X86SegmentRegs {
                cs: sregs.cs.selector as u32,
                ss: sregs.ss.selector as u32,
                ds: sregs.ds.selector as u32,
                es: sregs.es.selector as u32,
                fs: sregs.fs.selector as u32,
                gs: sregs.gs.selector as u32,
            },
            // Filled in from `fpu.fpr` below.
            st: Default::default(),
            // x87 FPU internal state
            // TODO(dverkamp): floating point tag word, instruction pointer, and data pointer
            fpu: X87FpuInternalRegs {
                fctrl: u32::from(fpu.fcw),
                fstat: u32::from(fpu.fsw),
                fop: u32::from(fpu.last_opcode),
                ..Default::default()
            },
            // Filled in from `fpu.xmm` below.
            xmm: Default::default(),
            mxcsr: fpu.mxcsr,
        };

        // x87 FPU registers: ST0-ST7.
        // `fpr` holds the registers in FXSAVE format: each element carries an 80-bit value in
        // its low 10 bytes; the remaining 6 bytes are reserved and are dropped here.
        core.st
            .iter_mut()
            .zip(fpu.fpr.iter())
            .for_each(|(st, raw)| st.copy_from_slice(&raw[..10]));

        // SSE registers: XMM0-XMM15
        core.xmm
            .iter_mut()
            .zip(fpu.xmm.iter())
            .for_each(|(xmm, raw)| *xmm = u128::from_le_bytes(*raw));

        Ok(core)
    }

    /// Writes GDB's core register set back to the vCPU.
    ///
    /// Segment and FPU state are read first and then patched, so fields GDB does not carry
    /// keep their current values.
    fn write_registers(vcpu: &T, regs: &X86_64CoreRegs) -> Result<()> {
        // General purpose registers (RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15) + RIP + rflags
        let current = vcpu.get_regs().map_err(Error::ReadRegs)?;
        let [rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp, r8, r9, r10, r11, r12, r13, r14, r15] =
            regs.regs;
        // Only the lower 32 bits of rflags come from GDB; the upper half is preserved.
        let rflags_high = current.rflags & !(u32::MAX as u64);
        let updated = Regs {
            rax,
            rbx,
            rcx,
            rdx,
            rsi,
            rdi,
            rbp,
            rsp,
            r8,
            r9,
            r10,
            r11,
            r12,
            r13,
            r14,
            r15,
            rip: regs.rip,
            rflags: rflags_high | (regs.eflags as u64),
        };
        vcpu.set_regs(&updated).map_err(Error::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // GDB only carries selectors, so start from the current sregs and patch those.
        let mut sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        let selectors = &regs.segments;
        sregs.cs.selector = selectors.cs as u16;
        sregs.ss.selector = selectors.ss as u16;
        sregs.ds.selector = selectors.ds as u16;
        sregs.es.selector = selectors.es as u16;
        sregs.fs.selector = selectors.fs as u16;
        sregs.gs.selector = selectors.gs as u16;
        vcpu.set_sregs(&sregs).map_err(Error::WriteRegs)?;

        // FPU and SSE registers
        let mut fpu = vcpu.get_fpu().map_err(Error::ReadRegs)?;
        fpu.fcw = regs.fpu.fctrl as u16;
        fpu.fsw = regs.fpu.fstat as u16;
        fpu.last_opcode = regs.fpu.fop as u16;
        // TODO(dverkamp): floating point tag word, instruction pointer, and data pointer

        // x87 FPU registers: ST0-ST7 (low 10 bytes of each FXSAVE slot)
        fpu.fpr
            .iter_mut()
            .zip(regs.st.iter())
            .for_each(|(slot, st)| slot[..10].copy_from_slice(st));

        // SSE registers: XMM0-XMM15
        fpu.xmm
            .iter_mut()
            .zip(regs.xmm.iter())
            .for_each(|(slot, xmm)| slot.copy_from_slice(&xmm.to_le_bytes()));

        vcpu.set_fpu(&fpu).map_err(Error::WriteRegs)
    }

    /// Single-register reads are not supported; always returns `Error::ReadRegIsUnsupported`.
    #[inline]
    fn read_register(_vcpu: &T, _reg: X86_64CoreRegId) -> Result<Vec<u8>> {
        Err(Error::ReadRegIsUnsupported)
    }

    /// Single-register writes are not supported; always returns `Error::WriteRegIsUnsupported`.
    #[inline]
    fn write_register(_vcpu: &T, _reg: X86_64CoreRegId, _buf: &[u8]) -> Result<()> {
        Err(Error::WriteRegIsUnsupported)
    }

    /// Reads `len` bytes of guest memory starting at guest virtual address `vaddr`.
    ///
    /// The address is translated with `phys_addr` one page at a time, so the read may cross
    /// page boundaries.
    fn read_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        len: usize,
    ) -> Result<Vec<u8>> {
        let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        let mut buf = vec![0; len];
        let wanted = len as u64;
        let mut done = 0u64;

        while done < wanted {
            let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + done, &sregs)?;
            // Never read past the end of the page that `paddr` lies in.
            let left_in_page = psize - (paddr & (psize - 1));
            let chunk = (wanted - done).min(left_in_page);
            let src = guest_mem
                .get_slice_at_addr(GuestAddress(paddr), chunk as usize)
                .map_err(Error::ReadingGuestMemory)?;
            src.copy_to(&mut buf[done as usize..]);
            done += chunk;
        }
        Ok(buf)
    }

    /// Writes `buf` to guest memory starting at guest virtual address `vaddr`.
    ///
    /// The address is translated with `phys_addr` one page at a time, so the write may cross
    /// page boundaries.
    fn write_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        buf: &[u8],
    ) -> Result<()> {
        let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        let wanted = buf.len() as u64;
        let mut done = 0u64;

        while done < wanted {
            let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + done, &sregs)?;
            // Never write past the end of the page that `paddr` lies in.
            let left_in_page = psize - (paddr & (psize - 1));
            let chunk = (wanted - done).min(left_in_page);

            let begin = done as usize;
            let end = begin + chunk as usize;
            guest_mem
                .write_all_at_addr(&buf[begin..end], GuestAddress(paddr))
                .map_err(Error::WritingGuestMemory)?;
            done += chunk;
        }
        Ok(())
    }

    /// Turns on single-stepping for `vcpu`, passing an empty breakpoint list.
    fn enable_singlestep(vcpu: &T) -> Result<()> {
        let enable_singlestep = true;
        vcpu.set_guest_debug(&[], enable_singlestep)
            .map_err(Error::EnableSinglestep)
    }

    /// Reports the number of hardware breakpoints available; this is always 4.
    fn get_max_hw_breakpoints(_vcpu: &T) -> Result<usize> {
        Ok(4usize)
    }

    /// Installs `breakpoints` as the vCPU's hardware breakpoints, with single-stepping off.
    fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<()> {
        let enable_singlestep = false;
        vcpu.set_guest_debug(breakpoints, enable_singlestep)
            .map_err(Error::SetHwBreakpoint)
    }
}
1372 
/// Translates the guest virtual address `vaddr` by walking the guest's page tables.
///
/// Returns the translated physical address together with the size of the page it resides in.
///
/// Two configurations are handled:
/// * paging disabled (CR0.PG clear): `vaddr` is returned unchanged with a 4 KiB page size;
/// * long mode (EFER.LMA set) with PAE: 4-level paging, or 5-level paging when CR4.LA57 is
///   set, including 1 GiB and 2 MiB large pages.
///
/// # Errors
///
/// * `Error::TranslatingVirtAddr` - paging is enabled without PAE, the vCPU is not in long
///   mode, or a page table entry could not be read from guest memory.
/// * `Error::PageNotPresent` - an entry on the walk has its present bit clear.
#[cfg(feature = "gdb")]
fn phys_addr(mem: &GuestMemory, vaddr: u64, sregs: &Sregs) -> Result<(u64, u64)> {
    const CR0_PG_MASK: u64 = 1 << 31;
    const CR4_PAE_MASK: u64 = 1 << 5;
    const CR4_LA57_MASK: u64 = 1 << 12;
    const MSR_EFER_LMA: u64 = 1 << 10;
    // bits 12 through 51 are the address in a PTE.
    const PTE_ADDR_MASK: u64 = ((1 << 52) - 1) & !0x0fff;
    const PAGE_PRESENT: u64 = 0x1;
    const PAGE_PSE_MASK: u64 = 0x1 << 7;

    const PAGE_SIZE_4K: u64 = 4 * 1024;
    const PAGE_SIZE_2M: u64 = 2 * 1024 * 1024;
    const PAGE_SIZE_1G: u64 = 1024 * 1024 * 1024;

    // Read the entry for `vaddr` from the table referenced by `curr_table_addr` (a CR3 value
    // or the parent entry) at the given paging `level`, failing if it is not present.
    fn next_pte(mem: &GuestMemory, curr_table_addr: u64, vaddr: u64, level: usize) -> Result<u64> {
        let ent: u64 = mem
            .read_obj_from_addr(GuestAddress(
                (curr_table_addr & PTE_ADDR_MASK) + page_table_offset(vaddr, level),
            ))
            .map_err(|_| Error::TranslatingVirtAddr)?;
        if ent & PAGE_PRESENT == 0 {
            return Err(Error::PageNotPresent);
        }
        Ok(ent)
    }

    // Get the offset in to the page of `vaddr`.
    fn page_offset(vaddr: u64, page_size: u64) -> u64 {
        vaddr & (page_size - 1)
    }

    // Get the offset in to the page table of the given `level` specified by the virtual `address`.
    // `level` is 1 through 5 in x86_64 to handle the five levels of paging.
    fn page_table_offset(addr: u64, level: usize) -> u64 {
        let offset = (level - 1) * 9 + 12;
        ((addr >> offset) & 0x1ff) << 3
    }

    // Combine the frame base held in the leaf entry `ent` with the offset of `vaddr` inside a
    // page of `page_size`. In a large-page entry the bits between bit 12 and the page size are
    // not address bits (bit 12 is PAT), so the base is masked down to the page size.
    fn frame_addr(ent: u64, vaddr: u64, page_size: u64) -> u64 {
        (ent & PTE_ADDR_MASK & !(page_size - 1)) | page_offset(vaddr, page_size)
    }

    if sregs.cr0 & CR0_PG_MASK == 0 {
        return Ok((vaddr, PAGE_SIZE_4K));
    }

    if sregs.cr4 & CR4_PAE_MASK == 0 {
        return Err(Error::TranslatingVirtAddr);
    }

    if sregs.efer & MSR_EFER_LMA == 0 {
        return Err(Error::TranslatingVirtAddr);
    }

    // With 5-level paging (CR4.LA57) CR3 points at the PML5 table, whose entry in turn points
    // at the PML4 table; otherwise CR3 points at the PML4 table directly.
    let p4_table = if sregs.cr4 & CR4_LA57_MASK != 0 {
        next_pte(mem, sregs.cr3, vaddr, 5)?
    } else {
        sregs.cr3
    };
    let p4_ent = next_pte(mem, p4_table, vaddr, 4)?;
    let p3_ent = next_pte(mem, p4_ent, vaddr, 3)?;
    if p3_ent & PAGE_PSE_MASK != 0 {
        // It's a 1G page with the PSE bit in p3_ent
        return Ok((frame_addr(p3_ent, vaddr, PAGE_SIZE_1G), PAGE_SIZE_1G));
    }
    let p2_ent = next_pte(mem, p3_ent, vaddr, 2)?;
    if p2_ent & PAGE_PSE_MASK != 0 {
        // It's a 2M page with the PSE bit in p2_ent
        return Ok((frame_addr(p2_ent, vaddr, PAGE_SIZE_2M), PAGE_SIZE_2M));
    }
    let p1_ent = next_pte(mem, p2_ent, vaddr, 1)?;
    Ok((frame_addr(p1_ent, vaddr, PAGE_SIZE_4K), PAGE_SIZE_4K))
}
1457 
/// `_OSC` status bit returned in CDW1: set when the caller's UUID is not supported.
const OSC_STATUS_UNSUPPORT_UUID: u32 = 1 << 2;

// PCI host bridge `_OSC` control register bits, returned in CDW3.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 1 << 0;
const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 1 << 1;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 1 << 2;
const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 1 << 3;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 1 << 4;

/// Marker type whose `Aml` implementation emits the PCI root `_OSC` method.
struct PciRootOSC {}
1471 
1472 // Method (_OSC, 4, NotSerialized)  // _OSC: Operating System Capabilities
1473 // {
1474 //     CreateDWordField (Arg3, Zero, CDW1)  // flag and return value
1475 //     If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
1476 //     {
1477 //         CreateDWordField (Arg3, 8, CDW3) // control field
1478 //         if ( 0 == (CDW1 & 0x01))  // Query flag ?
1479 //         {
1480 //              CDW3 &= !(SHPC_HP | AER)
1481 //         }
1482 //     } Else {
1483 //         CDW1 |= UNSUPPORT_UUID
1484 //     }
1485 //     Return (Arg3)
1486 // }
1487 impl Aml for PciRootOSC {
to_aml_bytes(&self, aml: &mut Vec<u8>)1488     fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
1489         let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
1490         // virtual pcie root port supports hotplug, pme, and pcie cap register, clear all
1491         // the other bits.
1492         let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP | PCI_HB_OSC_CONTROL_PCIE_AER);
1493         aml::Method::new(
1494             "_OSC".into(),
1495             4,
1496             false,
1497             vec![
1498                 &aml::CreateDWordField::new(
1499                     &aml::Name::new_field_name("CDW1"),
1500                     &aml::Arg(3),
1501                     &aml::ZERO,
1502                 ),
1503                 &aml::If::new(
1504                     &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
1505                     vec![
1506                         &aml::CreateDWordField::new(
1507                             &aml::Name::new_field_name("CDW3"),
1508                             &aml::Arg(3),
1509                             &(8_u8),
1510                         ),
1511                         &aml::If::new(
1512                             &aml::Equal::new(
1513                                 &aml::ZERO,
1514                                 &aml::And::new(
1515                                     &aml::ZERO,
1516                                     &aml::Name::new_field_name("CDW1"),
1517                                     &aml::ONE,
1518                                 ),
1519                             ),
1520                             vec![&aml::And::new(
1521                                 &aml::Name::new_field_name("CDW3"),
1522                                 &mask,
1523                                 &aml::Name::new_field_name("CDW3"),
1524                             )],
1525                         ),
1526                     ],
1527                 ),
1528                 &aml::Else::new(vec![&aml::Or::new(
1529                     &aml::Name::new_field_name("CDW1"),
1530                     &OSC_STATUS_UNSUPPORT_UUID,
1531                     &aml::Name::new_field_name("CDW1"),
1532                 )]),
1533                 &aml::Return::new(&aml::Arg(3)),
1534             ],
1535         )
1536         .to_aml_bytes(aml)
1537     }
1538 }
1539 
1540 impl X8664arch {
1541     /// Loads the bios from an open file.
1542     ///
1543     /// # Arguments
1544     ///
1545     /// * `mem` - The memory to be used by the guest.
1546     /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>1547     fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1548         let bios_image_length = bios_image.get_len().map_err(Error::LoadBios)?;
1549         if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1550             return Err(Error::LoadBios(io::Error::new(
1551                 io::ErrorKind::InvalidData,
1552                 format!(
1553                     "bios was {} bytes, expected less than {}",
1554                     bios_image_length, FIRST_ADDR_PAST_32BITS,
1555                 ),
1556             )));
1557         }
1558 
1559         let guest_slice = mem
1560             .get_slice_at_addr(bios_start(bios_image_length), bios_image_length as usize)
1561             .map_err(Error::SetupGuestMemory)?;
1562         bios_image
1563             .read_exact_at_volatile(guest_slice, 0)
1564             .map_err(Error::LoadBios)?;
1565         Ok(())
1566     }
1567 
setup_pflash( pflash_image: File, block_size: u32, bios_size: u64, mmio_bus: &Bus, jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>1568     fn setup_pflash(
1569         pflash_image: File,
1570         block_size: u32,
1571         bios_size: u64,
1572         mmio_bus: &Bus,
1573         jail: Option<Minijail>,
1574         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1575     ) -> Result<()> {
1576         let size = pflash_image.metadata().map_err(Error::LoadPflash)?.len();
1577         let start = FIRST_ADDR_PAST_32BITS - bios_size - size;
1578         let pflash_image = Box::new(pflash_image);
1579 
1580         #[cfg(any(target_os = "android", target_os = "linux"))]
1581         let fds = pflash_image.as_raw_descriptors();
1582 
1583         let pflash = Pflash::new(pflash_image, block_size).map_err(Error::SetupPflash)?;
1584         let pflash: Arc<Mutex<dyn BusDevice>> = match jail {
1585             #[cfg(any(target_os = "android", target_os = "linux"))]
1586             Some(jail) => Arc::new(Mutex::new(
1587                 ProxyDevice::new(
1588                     pflash,
1589                     jail,
1590                     fds,
1591                     #[cfg(feature = "swap")]
1592                     swap_controller,
1593                 )
1594                 .map_err(Error::CreateProxyDevice)?,
1595             )),
1596             #[cfg(windows)]
1597             Some(_) => unreachable!(),
1598             None => Arc::new(Mutex::new(pflash)),
1599         };
1600         mmio_bus
1601             .insert(pflash, start, size)
1602             .map_err(Error::InsertBus)?;
1603 
1604         Ok(())
1605     }
1606 
1607     /// Loads the kernel from an open file.
1608     ///
1609     /// # Arguments
1610     ///
1611     /// * `mem` - The memory to be used by the guest.
1612     /// * `kernel_image` - the File object for the specified kernel.
1613     ///
1614     /// # Returns
1615     ///
1616     /// On success, returns the Linux x86_64 boot protocol parameters, the first address past the
1617     /// end of the kernel, and the entry point (initial `RIP` value).
load_kernel( mem: &GuestMemory, kernel_image: &mut File, ) -> Result<(boot_params, u64, GuestAddress)>1618     fn load_kernel(
1619         mem: &GuestMemory,
1620         kernel_image: &mut File,
1621     ) -> Result<(boot_params, u64, GuestAddress)> {
1622         let kernel_start = GuestAddress(KERNEL_START_OFFSET);
1623         match kernel_loader::load_elf64(mem, kernel_start, kernel_image, 0) {
1624             Ok(loaded_kernel) => {
1625                 // ELF kernels don't contain a `boot_params` structure, so synthesize a default one.
1626                 let boot_params = Default::default();
1627                 Ok((
1628                     boot_params,
1629                     loaded_kernel.address_range.end,
1630                     loaded_kernel.entry,
1631                 ))
1632             }
1633             Err(kernel_loader::Error::InvalidMagicNumber) => {
1634                 // The image failed to parse as ELF, so try to load it as a bzImage.
1635                 let (boot_params, bzimage_end) =
1636                     bzimage::load_bzimage(mem, kernel_start, kernel_image)
1637                         .map_err(Error::LoadBzImage)?;
1638                 let bzimage_entry = mem
1639                     .checked_offset(kernel_start, KERNEL_64BIT_ENTRY_OFFSET)
1640                     .ok_or(Error::KernelOffsetPastEnd)?;
1641                 Ok((boot_params, bzimage_end, bzimage_entry))
1642             }
1643             Err(e) => Err(Error::LoadKernel(e)),
1644         }
1645     }
1646 
1647     /// Configures the system memory space should be called once per vm before
1648     /// starting vcpu threads.
1649     ///
1650     /// # Arguments
1651     ///
1652     /// * `mem` - The memory to be used by the guest.
1653     /// * `cmdline` - the kernel commandline
1654     /// * `initrd_file` - an initial ramdisk image
setup_system_memory( mem: &GuestMemory, cmdline: &CStr, initrd_file: Option<File>, android_fstab: Option<File>, kernel_end: u64, params: boot_params, dump_device_tree_blob: Option<PathBuf>, device_tree_overlays: Vec<DtbOverlay>, ) -> Result<()>1655     pub fn setup_system_memory(
1656         mem: &GuestMemory,
1657         cmdline: &CStr,
1658         initrd_file: Option<File>,
1659         android_fstab: Option<File>,
1660         kernel_end: u64,
1661         params: boot_params,
1662         dump_device_tree_blob: Option<PathBuf>,
1663         device_tree_overlays: Vec<DtbOverlay>,
1664     ) -> Result<()> {
1665         kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
1666             .map_err(Error::LoadCmdline)?;
1667 
1668         let mut setup_data = Vec::<SetupData>::new();
1669         if let Some(android_fstab) = android_fstab {
1670             setup_data.push(
1671                 fdt::create_fdt(android_fstab, dump_device_tree_blob, device_tree_overlays)
1672                     .map_err(Error::CreateFdt)?,
1673             );
1674         }
1675         setup_data.push(setup_data_rng_seed());
1676 
1677         let setup_data = write_setup_data(
1678             mem,
1679             GuestAddress(SETUP_DATA_START),
1680             GuestAddress(SETUP_DATA_END),
1681             &setup_data,
1682         )?;
1683 
1684         let initrd = match initrd_file {
1685             Some(mut initrd_file) => {
1686                 let mut initrd_addr_max = u64::from(params.hdr.initrd_addr_max);
1687                 // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
1688                 if initrd_addr_max == 0 {
1689                     initrd_addr_max = 0x37FFFFFF;
1690                 }
1691 
1692                 let mem_max = mem.end_addr().offset() - 1;
1693                 if initrd_addr_max > mem_max {
1694                     initrd_addr_max = mem_max;
1695                 }
1696 
1697                 let (initrd_start, initrd_size) = arch::load_image_high(
1698                     mem,
1699                     &mut initrd_file,
1700                     GuestAddress(kernel_end),
1701                     GuestAddress(initrd_addr_max),
1702                     base::pagesize() as u64,
1703                 )
1704                 .map_err(Error::LoadInitrd)?;
1705                 Some((initrd_start, initrd_size))
1706             }
1707             None => None,
1708         };
1709 
1710         configure_system(
1711             mem,
1712             GuestAddress(KERNEL_START_OFFSET),
1713             GuestAddress(CMDLINE_OFFSET),
1714             cmdline.to_bytes().len() + 1,
1715             setup_data,
1716             initrd,
1717             params,
1718         )?;
1719         Ok(())
1720     }
1721 
get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange1722     fn get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange {
1723         // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is
1724         // greater.
1725         let ram_end_round_2mb = (mem.end_addr().offset() + 2 * MB - 1) / (2 * MB) * (2 * MB);
1726         let start = std::cmp::max(ram_end_round_2mb, 4 * GB);
1727         // Each pci device's ECAM size is 4kb and its vcfg size is 8kb
1728         let end = start + pcie_cfg_mmio.len().unwrap() * 2 - 1;
1729         AddressRange { start, end }
1730     }
1731 
1732     /// Returns the high mmio range
get_high_mmio_range<V: Vm>(vm: &V) -> AddressRange1733     fn get_high_mmio_range<V: Vm>(vm: &V) -> AddressRange {
1734         let mem = vm.get_memory();
1735         let start = Self::get_pcie_vcfg_mmio_range(mem, &read_pcie_cfg_mmio()).end + 1;
1736 
1737         let phys_mem_end = (1u64 << vm.get_guest_phys_addr_bits()) - 1;
1738         let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
1739 
1740         AddressRange {
1741             start,
1742             end: high_mmio_end,
1743         }
1744     }
1745 
1746     /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline() -> kernel_cmdline::Cmdline1747     pub fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
1748         let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
1749         cmdline.insert_str("panic=-1").unwrap();
1750 
1751         cmdline
1752     }
1753 
1754     /// Sets up fw_cfg device.
1755     ///  # Arguments
1756     ///
1757     /// * - `io_bus` - the IO bus object
1758     /// * - `fw_cfg_parameters` - command-line specified data to add to device. May contain
1759     /// all None fields if user did not specify data to add to the device
setup_fw_cfg_device( io_bus: &Bus, fw_cfg_parameters: Vec<FwCfgParameters>, bootorder_fw_cfg_blob: Vec<u8>, fw_cfg_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>1760     fn setup_fw_cfg_device(
1761         io_bus: &Bus,
1762         fw_cfg_parameters: Vec<FwCfgParameters>,
1763         bootorder_fw_cfg_blob: Vec<u8>,
1764         fw_cfg_jail: Option<Minijail>,
1765         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1766     ) -> Result<()> {
1767         let fw_cfg = match devices::FwCfgDevice::new(FW_CFG_MAX_FILE_SLOTS, fw_cfg_parameters) {
1768             Ok(mut device) => {
1769                 // this condition will only be true if the user specified at least one bootindex
1770                 // option on the command line. If none were specified, bootorder_fw_cfg_blob will
1771                 // only have a null byte (null terminator)
1772                 if bootorder_fw_cfg_blob.len() > 1 {
1773                     // Add boot order file to the device. If the file is not present, firmware may
1774                     // not be able to boot.
1775                     if let Err(err) = device.add_file(
1776                         "bootorder",
1777                         bootorder_fw_cfg_blob,
1778                         devices::FwCfgItemType::GenericItem,
1779                     ) {
1780                         return Err(Error::CreateFwCfgDevice(err));
1781                     }
1782                 }
1783                 device
1784             }
1785             Err(err) => {
1786                 return Err(Error::CreateFwCfgDevice(err));
1787             }
1788         };
1789 
1790         let fw_cfg: Arc<Mutex<dyn BusDevice>> = match fw_cfg_jail.as_ref() {
1791             #[cfg(any(target_os = "android", target_os = "linux"))]
1792             Some(jail) => {
1793                 let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
1794                 #[cfg(feature = "seccomp_trace")]
1795                 debug!(
1796                     "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
1797                     read_jail_addr(jail),
1798                     read_jail_addr(&jail_clone)
1799                 );
1800                 Arc::new(Mutex::new(
1801                     ProxyDevice::new(
1802                         fw_cfg,
1803                         jail_clone,
1804                         Vec::new(),
1805                         #[cfg(feature = "swap")]
1806                         swap_controller,
1807                     )
1808                     .map_err(Error::CreateProxyDevice)?,
1809                 ))
1810             }
1811             #[cfg(windows)]
1812             Some(_) => unreachable!(),
1813             None => Arc::new(Mutex::new(fw_cfg)),
1814         };
1815 
1816         io_bus
1817             .insert(fw_cfg, FW_CFG_BASE_PORT, FW_CFG_WIDTH)
1818             .map_err(Error::InsertBus)?;
1819 
1820         Ok(())
1821     }
1822 
1823     /// Sets up the legacy x86 i8042/KBD platform device
1824     ///
1825     /// # Arguments
1826     ///
1827     /// * - `io_bus` - the IO bus object
1828     /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
1829     /// * - `vm_evt_wrtube` - the event object which should receive exit events
setup_legacy_i8042_device( io_bus: &Bus, pit_uses_speaker_port: bool, vm_evt_wrtube: SendTube, ) -> Result<()>1830     pub fn setup_legacy_i8042_device(
1831         io_bus: &Bus,
1832         pit_uses_speaker_port: bool,
1833         vm_evt_wrtube: SendTube,
1834     ) -> Result<()> {
1835         let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
1836             vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1837         )));
1838 
1839         if pit_uses_speaker_port {
1840             io_bus.insert(i8042, 0x062, 0x3).unwrap();
1841         } else {
1842             io_bus.insert(i8042, 0x061, 0x4).unwrap();
1843         }
1844 
1845         Ok(())
1846     }
1847 
1848     /// Sets up the legacy x86 CMOS/RTC platform device
1849     /// # Arguments
1850     ///
1851     /// * - `io_bus` - the IO bus object
1852     /// * - `mem_size` - the size in bytes of physical ram for the guest
setup_legacy_cmos_device( io_bus: &Bus, irq_chip: &mut dyn IrqChipX86_64, vm_control: Tube, mem_size: u64, ) -> anyhow::Result<()>1853     pub fn setup_legacy_cmos_device(
1854         io_bus: &Bus,
1855         irq_chip: &mut dyn IrqChipX86_64,
1856         vm_control: Tube,
1857         mem_size: u64,
1858     ) -> anyhow::Result<()> {
1859         let mem_regions = arch_memory_regions(mem_size, None);
1860 
1861         let mem_below_4g = mem_regions
1862             .iter()
1863             .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
1864             .map(|r| r.1)
1865             .sum();
1866 
1867         let mem_above_4g = mem_regions
1868             .iter()
1869             .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
1870             .map(|r| r.1)
1871             .sum();
1872 
1873         let irq_evt = devices::IrqEdgeEvent::new().context("cmos irq")?;
1874         let cmos = devices::cmos::Cmos::new(
1875             mem_below_4g,
1876             mem_above_4g,
1877             Utc::now,
1878             vm_control,
1879             irq_evt.try_clone().context("cmos irq clone")?,
1880         )
1881         .context("create cmos")?;
1882 
1883         irq_chip
1884             .register_edge_irq_event(
1885                 devices::cmos::RTC_IRQ as u32,
1886                 &irq_evt,
1887                 IrqEventSource::from_device(&cmos),
1888             )
1889             .context("cmos register irq")?;
1890         io_bus
1891             .insert(Arc::new(Mutex::new(cmos)), 0x70, 0x2)
1892             .context("cmos insert irq")?;
1893 
1894         Ok(())
1895     }
1896 
1897     /// Sets up the acpi devices for this platform and
1898     /// return the resources which is used to set the ACPI tables.
1899     ///
1900     /// # Arguments
1901     ///
1902     /// * - `io_bus` the I/O bus to add the devices to
1903     /// * - `resources` the SystemAllocator to allocate IO and MMIO for acpi devices.
1904     /// * - `suspend_evt` the event object which used to suspend the vm
1905     /// * - `sdts` ACPI system description tables
1906     /// * - `irq_chip` the IrqChip object for registering irq events
1907     /// * - `battery` indicate whether to create the battery
1908     /// * - `mmio_bus` the MMIO bus to add the devices to
1909     /// * - `pci_irqs` IRQ assignment of PCI devices. Tuples of (PCI address, gsi, PCI interrupt
1910     ///   pin). Note that this matches one of the return values of generate_pci_root.
setup_acpi_devices( pci_root: Arc<Mutex<PciRoot>>, mem: &GuestMemory, io_bus: &Bus, resources: &mut SystemAllocator, suspend_evt: Event, vm_evt_wrtube: SendTube, sdts: Vec<SDT>, irq_chip: &mut dyn IrqChip, sci_irq: u32, battery: (Option<BatteryType>, Option<Minijail>), #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus, max_bus: u8, resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, #[cfg(any(target_os = "android", target_os = "linux"))] ac_adapter: bool, #[cfg(any(target_os = "android", target_os = "linux"))] guest_suspended_cvar: Option< Arc<(Mutex<bool>, Condvar)>, >, pci_irqs: &[(PciAddress, u32, PciInterruptPin)], ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)>1911     pub fn setup_acpi_devices(
1912         pci_root: Arc<Mutex<PciRoot>>,
1913         mem: &GuestMemory,
1914         io_bus: &Bus,
1915         resources: &mut SystemAllocator,
1916         suspend_evt: Event,
1917         vm_evt_wrtube: SendTube,
1918         sdts: Vec<SDT>,
1919         irq_chip: &mut dyn IrqChip,
1920         sci_irq: u32,
1921         battery: (Option<BatteryType>, Option<Minijail>),
1922         #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus,
1923         max_bus: u8,
1924         resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
1925         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1926         #[cfg(any(target_os = "android", target_os = "linux"))] ac_adapter: bool,
1927         #[cfg(any(target_os = "android", target_os = "linux"))] guest_suspended_cvar: Option<
1928             Arc<(Mutex<bool>, Condvar)>,
1929         >,
1930         pci_irqs: &[(PciAddress, u32, PciInterruptPin)],
1931     ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
1932         // The AML data for the acpi devices
1933         let mut amls = Vec::new();
1934 
1935         let bat_control = if let Some(battery_type) = battery.0 {
1936             match battery_type {
1937                 #[cfg(any(target_os = "android", target_os = "linux"))]
1938                 BatteryType::Goldfish => {
1939                     let irq_num = resources.allocate_irq().ok_or(Error::CreateBatDevices(
1940                         arch::DeviceRegistrationError::AllocateIrq,
1941                     ))?;
1942                     let (control_tube, _mmio_base) = arch::sys::linux::add_goldfish_battery(
1943                         &mut amls,
1944                         battery.1,
1945                         mmio_bus,
1946                         irq_chip,
1947                         irq_num,
1948                         resources,
1949                         #[cfg(feature = "swap")]
1950                         swap_controller,
1951                     )
1952                     .map_err(Error::CreateBatDevices)?;
1953                     Some(BatControl {
1954                         type_: BatteryType::Goldfish,
1955                         control_tube,
1956                     })
1957                 }
1958                 #[cfg(windows)]
1959                 _ => None,
1960             }
1961         } else {
1962             None
1963         };
1964 
1965         let pm_alloc = resources.get_anon_alloc();
1966         let pm_iobase = match resources.io_allocator() {
1967             Some(io) => io
1968                 .allocate_with_align(
1969                     devices::acpi::ACPIPM_RESOURCE_LEN as u64,
1970                     pm_alloc,
1971                     "ACPIPM".to_string(),
1972                     4, // must be 32-bit aligned
1973                 )
1974                 .map_err(Error::AllocateIOResouce)?,
1975             None => 0x600,
1976         };
1977 
1978         let pcie_vcfg = aml::Name::new(
1979             "VCFG".into(),
1980             &Self::get_pcie_vcfg_mmio_range(mem, &read_pcie_cfg_mmio()).start,
1981         );
1982         pcie_vcfg.to_aml_bytes(&mut amls);
1983 
1984         let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
1985 
1986         #[cfg(any(target_os = "android", target_os = "linux"))]
1987         let acdc = if ac_adapter {
1988             // Allocate GPE for AC adapter notfication
1989             let gpe = resources.allocate_gpe().ok_or(Error::AllocateGpe)?;
1990 
1991             let alloc = resources.get_anon_alloc();
1992             let mmio_base = resources
1993                 .allocate_mmio(
1994                     devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
1995                     alloc,
1996                     "AcAdapter".to_string(),
1997                     resources::AllocOptions::new().align(devices::ac_adapter::ACDC_VIRT_MMIO_SIZE),
1998                 )
1999                 .unwrap();
2000             let ac_adapter_dev = devices::ac_adapter::AcAdapter::new(mmio_base, gpe);
2001             let ac_dev = Arc::new(Mutex::new(ac_adapter_dev));
2002             mmio_bus
2003                 .insert(
2004                     ac_dev.clone(),
2005                     mmio_base,
2006                     devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
2007                 )
2008                 .unwrap();
2009 
2010             ac_dev.lock().to_aml_bytes(&mut amls);
2011             Some(ac_dev)
2012         } else {
2013             None
2014         };
2015         #[cfg(windows)]
2016         let acdc = None;
2017 
2018         //Virtual PMC
2019         #[cfg(any(target_os = "android", target_os = "linux"))]
2020         if let Some(guest_suspended_cvar) = guest_suspended_cvar {
2021             let alloc = resources.get_anon_alloc();
2022             let mmio_base = resources
2023                 .allocate_mmio(
2024                     devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
2025                     alloc,
2026                     "VirtualPmc".to_string(),
2027                     resources::AllocOptions::new().align(devices::pmc_virt::VPMC_VIRT_MMIO_SIZE),
2028                 )
2029                 .unwrap();
2030 
2031             let pmc_virtio_mmio =
2032                 Arc::new(Mutex::new(VirtualPmc::new(mmio_base, guest_suspended_cvar)));
2033             mmio_bus
2034                 .insert(
2035                     pmc_virtio_mmio.clone(),
2036                     mmio_base,
2037                     devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
2038                 )
2039                 .unwrap();
2040             pmc_virtio_mmio.lock().to_aml_bytes(&mut amls);
2041         }
2042 
2043         let mut pmresource = devices::ACPIPMResource::new(
2044             pm_sci_evt.try_clone().map_err(Error::CloneEvent)?,
2045             suspend_evt,
2046             vm_evt_wrtube,
2047             acdc,
2048         );
2049         pmresource.to_aml_bytes(&mut amls);
2050         irq_chip
2051             .register_level_irq_event(
2052                 sci_irq,
2053                 &pm_sci_evt,
2054                 IrqEventSource::from_device(&pmresource),
2055             )
2056             .map_err(Error::RegisterIrqfd)?;
2057         pmresource.start();
2058 
2059         let mut crs_entries: Vec<Box<dyn Aml>> = vec![
2060             Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
2061             Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
2062         ];
2063         for r in resources.mmio_pools() {
2064             let entry: Box<dyn Aml> = match (u32::try_from(r.start), u32::try_from(r.end)) {
2065                 (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
2066                     aml::AddressSpaceCachable::NotCacheable,
2067                     true,
2068                     start,
2069                     end,
2070                 )),
2071                 _ => Box::new(aml::AddressSpace::new_memory(
2072                     aml::AddressSpaceCachable::NotCacheable,
2073                     true,
2074                     r.start,
2075                     r.end,
2076                 )),
2077             };
2078             crs_entries.push(entry);
2079         }
2080 
2081         let prt_entries: Vec<aml::Package> = pci_irqs
2082             .iter()
2083             .map(|(pci_address, gsi, pci_intr_pin)| {
2084                 aml::Package::new(vec![
2085                     &pci_address.acpi_adr(),
2086                     &pci_intr_pin.to_mask(),
2087                     &aml::ZERO,
2088                     gsi,
2089                 ])
2090             })
2091             .collect();
2092 
2093         aml::Device::new(
2094             "_SB_.PC00".into(),
2095             vec![
2096                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08")),
2097                 &aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03")),
2098                 &aml::Name::new("_ADR".into(), &aml::ZERO),
2099                 &aml::Name::new("_SEG".into(), &aml::ZERO),
2100                 &aml::Name::new("_UID".into(), &aml::ZERO),
2101                 &aml::Name::new("SUPP".into(), &aml::ZERO),
2102                 &aml::Name::new(
2103                     "_CRS".into(),
2104                     &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
2105                 ),
2106                 &PciRootOSC {},
2107                 &aml::Name::new(
2108                     "_PRT".into(),
2109                     &aml::Package::new(prt_entries.iter().map(|p| p as &dyn Aml).collect()),
2110                 ),
2111             ],
2112         )
2113         .to_aml_bytes(&mut amls);
2114 
2115         if let (Some(start), Some(len)) = (
2116             u32::try_from(read_pcie_cfg_mmio().start).ok(),
2117             read_pcie_cfg_mmio()
2118                 .len()
2119                 .and_then(|l| u32::try_from(l).ok()),
2120         ) {
2121             aml::Device::new(
2122                 "_SB_.MB00".into(),
2123                 vec![
2124                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C02")),
2125                     &aml::Name::new(
2126                         "_CRS".into(),
2127                         &aml::ResourceTemplate::new(vec![&aml::Memory32Fixed::new(
2128                             true, start, len,
2129                         )]),
2130                     ),
2131                 ],
2132             )
2133             .to_aml_bytes(&mut amls);
2134         } else {
2135             warn!("Failed to create ACPI MMCFG region reservation");
2136         }
2137 
2138         let root_bus = pci_root.lock().get_root_bus();
2139         let addresses = root_bus.lock().get_downstream_devices();
2140         for address in addresses {
2141             if let Some(acpi_path) = pci_root.lock().acpi_path(&address) {
2142                 const DEEPEST_SLEEP_STATE: u32 = 3;
2143                 aml::Device::new(
2144                     (*acpi_path).into(),
2145                     vec![
2146                         &aml::Name::new("_ADR".into(), &address.acpi_adr()),
2147                         &aml::Name::new(
2148                             "_PRW".into(),
2149                             &aml::Package::new(vec![&PM_WAKEUP_GPIO, &DEEPEST_SLEEP_STATE]),
2150                         ),
2151                     ],
2152                 )
2153                 .to_aml_bytes(&mut amls);
2154             }
2155         }
2156 
2157         let pm = Arc::new(Mutex::new(pmresource));
2158         io_bus
2159             .insert(
2160                 pm.clone(),
2161                 pm_iobase,
2162                 devices::acpi::ACPIPM_RESOURCE_LEN as u64,
2163             )
2164             .unwrap();
2165         resume_notify_devices.push(pm.clone());
2166 
2167         Ok((
2168             acpi::AcpiDevResource {
2169                 amls,
2170                 pm_iobase,
2171                 pm,
2172                 sdts,
2173             },
2174             bat_control,
2175         ))
2176     }
2177 
2178     /// Sets up the serial devices for this platform. Returns a list of configured serial devices.
2179     ///
2180     /// # Arguments
2181     ///
2182     /// * - `irq_chip` the IrqChip object for registering irq events
2183     /// * - `io_bus` the I/O bus to add the devices to
2184     /// * - `serial_parameters` - definitions for how the serial devices should be configured
setup_serial_devices( protection_type: ProtectionType, irq_chip: &mut dyn IrqChip, io_bus: &Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<Vec<SerialDeviceInfo>>2185     pub fn setup_serial_devices(
2186         protection_type: ProtectionType,
2187         irq_chip: &mut dyn IrqChip,
2188         io_bus: &Bus,
2189         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2190         serial_jail: Option<Minijail>,
2191         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2192     ) -> Result<Vec<SerialDeviceInfo>> {
2193         let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2194         let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2195 
2196         let serial_devices = arch::add_serial_devices(
2197             protection_type,
2198             io_bus,
2199             (X86_64_SERIAL_1_3_IRQ, com_evt_1_3.get_trigger()),
2200             (X86_64_SERIAL_2_4_IRQ, com_evt_2_4.get_trigger()),
2201             serial_parameters,
2202             serial_jail,
2203             #[cfg(feature = "swap")]
2204             swap_controller,
2205         )
2206         .map_err(Error::CreateSerialDevices)?;
2207 
2208         let source = IrqEventSource {
2209             device_id: Serial::device_id(),
2210             queue_id: 0,
2211             device_name: Serial::debug_label(),
2212         };
2213         irq_chip
2214             .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3, source.clone())
2215             .map_err(Error::RegisterIrqfd)?;
2216         irq_chip
2217             .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4, source)
2218             .map_err(Error::RegisterIrqfd)?;
2219 
2220         Ok(serial_devices)
2221     }
2222 
setup_debugcon_devices( protection_type: ProtectionType, io_bus: &Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, debugcon_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>2223     fn setup_debugcon_devices(
2224         protection_type: ProtectionType,
2225         io_bus: &Bus,
2226         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2227         debugcon_jail: Option<Minijail>,
2228         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2229     ) -> Result<()> {
2230         for param in serial_parameters.values() {
2231             if param.hardware != SerialHardware::Debugcon {
2232                 continue;
2233             }
2234 
2235             let mut preserved_fds = Vec::new();
2236             let con = param
2237                 .create_serial_device::<Debugcon>(
2238                     protection_type,
2239                     // Debugcon doesn't use the interrupt event
2240                     &Event::new().map_err(Error::CreateEvent)?,
2241                     &mut preserved_fds,
2242                 )
2243                 .map_err(Error::CreateDebugconDevice)?;
2244 
2245             let con: Arc<Mutex<dyn BusDevice>> = match debugcon_jail.as_ref() {
2246                 #[cfg(any(target_os = "android", target_os = "linux"))]
2247                 Some(jail) => {
2248                     let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
2249                     #[cfg(feature = "seccomp_trace")]
2250                     debug!(
2251                         "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
2252                         read_jail_addr(jail),
2253                         read_jail_addr(&jail_clone)
2254                     );
2255                     Arc::new(Mutex::new(
2256                         ProxyDevice::new(
2257                             con,
2258                             jail_clone,
2259                             preserved_fds,
2260                             #[cfg(feature = "swap")]
2261                             swap_controller,
2262                         )
2263                         .map_err(Error::CreateProxyDevice)?,
2264                     ))
2265                 }
2266                 #[cfg(windows)]
2267                 Some(_) => unreachable!(),
2268                 None => Arc::new(Mutex::new(con)),
2269             };
2270             io_bus
2271                 .insert(con.clone(), param.debugcon_port.into(), 1)
2272                 .map_err(Error::InsertBus)?;
2273         }
2274 
2275         Ok(())
2276     }
2277 }
2278 
2279 #[sorted]
2280 #[derive(Error, Debug)]
2281 pub enum MsrError {
2282     #[error("CPU not support. Only intel CPUs support ITMT.")]
2283     CpuUnSupport,
2284     #[error("msr must be unique: {0}")]
2285     MsrDuplicate(u32),
2286 }
2287 
2288 #[derive(Error, Debug)]
2289 pub enum HybridSupportError {
2290     #[error("Host CPU doesn't support hybrid architecture.")]
2291     UnsupportedHostCpu,
2292 }
2293 
/// Bundle of the two CPUID query functions.
///
/// Holding them as function pointers lets callers substitute fake implementations in tests
/// instead of executing the real `cpuid` instruction.
pub struct CpuIdCall {
    /// Queries a leaf together with a sub-leaf: `__cpuid_count` or a fake function for tests.
    cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
    /// Queries a leaf only: `__cpuid` or a fake function for tests.
    cpuid: unsafe fn(u32) -> CpuidResult,
}

impl CpuIdCall {
    /// Builds a `CpuIdCall` from the given function pointers.
    ///
    /// # Arguments
    ///
    /// * `cpuid_count` - function taking two `u32` arguments (see the `cpuid_count` field).
    /// * `cpuid` - function taking one `u32` argument (see the `cpuid` field).
    pub fn new(
        cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
        cpuid: unsafe fn(u32) -> CpuidResult,
    ) -> Self {
        Self { cpuid_count, cpuid }
    }
}
2310 
2311 /// Check if host supports hybrid CPU feature. The check include:
2312 ///     1. Check if CPUID.1AH exists. CPUID.1AH is hybrid information enumeration leaf.
2313 ///     2. Check if CPUID.07H.00H:EDX[bit 15] sets. This bit means the processor is identified as a
2314 ///        hybrid part.
2315 ///     3. Check if CPUID.1AH:EAX sets. The hybrid core type is set in EAX.
2316 ///
2317 /// # Arguments
2318 ///
2319 /// * - `cpuid` the wrapped cpuid functions used to get CPUID info.
check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError>2320 pub fn check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError> {
2321     // CPUID.0H.EAX returns maximum input value for basic CPUID information.
2322     //
2323     // SAFETY:
2324     // Safe because we pass 0 for this call and the host supports the
2325     // `cpuid` instruction.
2326     let mut cpuid_entry = unsafe { (cpuid.cpuid)(0x0) };
2327     if cpuid_entry.eax < 0x1A {
2328         return Err(HybridSupportError::UnsupportedHostCpu);
2329     }
2330     // SAFETY:
2331     // Safe because we pass 0x7 and 0 for this call and the host supports the
2332     // `cpuid` instruction.
2333     cpuid_entry = unsafe { (cpuid.cpuid_count)(0x7, 0) };
2334     if cpuid_entry.edx & 1 << EDX_HYBRID_CPU_SHIFT == 0 {
2335         return Err(HybridSupportError::UnsupportedHostCpu);
2336     }
2337     // From SDM, if a value entered for CPUID.EAX is less than or equal to the
2338     // maximum input value and the leaf is not supported on that processor then
2339     // 0 is returned in all the registers.
2340     // For the CPU with hybrid support, its CPUID.1AH.EAX shouldn't be zero.
2341     //
2342     // SAFETY:
2343     // Safe because we pass 0 for this call and the host supports the
2344     // `cpuid` instruction.
2345     cpuid_entry = unsafe { (cpuid.cpuid)(0x1A) };
2346     if cpuid_entry.eax == 0 {
2347         return Err(HybridSupportError::UnsupportedHostCpu);
2348     }
2349     Ok(())
2350 }
2351 
#[cfg(test)]
mod tests {
    use std::mem::size_of;

    use super::*;

    /// Address at which guest RAM ends in the `regions_eq_*` tests. It has the same value as
    /// the PCI MMIO start that `setup()` configures.
    const TEST_MEMORY_SIZE: u64 = 2 * GB;

    /// Initializes the low memory layout shared by the tests: a 256 MB PCIe ECAM window at
    /// 3 GB and a PCI MMIO start of 2 GB.
    fn setup() {
        let pcie_ecam = Some(AddressRange::from_start_and_size(3 * GB, 256 * MB).unwrap());
        let pci_start = Some(2 * GB);
        init_low_memory_layout(pcie_ecam, pci_start);
    }

    /// 512 MB of RAM without a BIOS yields a single region starting at `START_OF_RAM_32BITS`.
    #[test]
    fn regions_lt_4gb_nobios() {
        setup();
        let regions = arch_memory_regions(512 * MB, /* bios_size */ None);
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(1u64 << 29, regions[0].1);
    }

    /// RAM larger than 4 GB without a BIOS is split into a low region and a region at 4 GB
    /// whose sizes add up to the requested amount.
    #[test]
    fn regions_gt_4gb_nobios() {
        setup();
        let size = 4 * GB + 0x8000;
        let regions = arch_memory_regions(size, /* bios_size */ None);
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(GuestAddress(4 * GB), regions[1].0);
        assert_eq!(size, regions[0].1 + regions[1].1);
    }

    /// 512 MB of RAM plus a BIOS yields the RAM region followed by a BIOS region that ends at
    /// `FIRST_ADDR_PAST_32BITS`.
    #[test]
    fn regions_lt_4gb_bios() {
        setup();
        let bios_len = MB;
        let regions = arch_memory_regions(512 * MB, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(512 * MB, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    /// RAM larger than 4 GB plus a BIOS yields three regions: low RAM, the BIOS just below
    /// `FIRST_ADDR_PAST_32BITS`, and high RAM at 4 GB.
    #[test]
    fn regions_gt_4gb_bios() {
        setup();
        let bios_len = MB;
        let regions = arch_memory_regions(4 * GB + 0x8000, Some(bios_len));
        assert_eq!(3, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
        assert_eq!(GuestAddress(4 * GB), regions[2].0);
    }

    /// RAM that ends exactly at `TEST_MEMORY_SIZE` still fits in a single low region.
    #[test]
    fn regions_eq_4gb_nobios() {
        setup();
        // RAM starts at START_OF_RAM_32BITS, so this size makes it end exactly at
        // TEST_MEMORY_SIZE (2 GB), the PCI MMIO start configured by setup().
        let ram_size = TEST_MEMORY_SIZE - START_OF_RAM_32BITS;
        let regions = arch_memory_regions(ram_size, /* bios_size */ None);
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(ram_size, regions[0].1);
    }

    /// Same boundary case as `regions_eq_4gb_nobios`, with an additional BIOS region.
    #[test]
    fn regions_eq_4gb_bios() {
        setup();
        // RAM starts at START_OF_RAM_32BITS, so this size makes it end exactly at
        // TEST_MEMORY_SIZE (2 GB), the PCI MMIO start configured by setup().
        let ram_size = TEST_MEMORY_SIZE - START_OF_RAM_32BITS;
        let bios_len = MB;
        let regions = arch_memory_regions(ram_size, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(ram_size, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    /// The layout readers report the values passed to `init_low_memory_layout` in `setup()`.
    #[test]
    fn check_pci_mmio_layout() {
        setup();

        assert_eq!(read_pci_mmio_before_32bit().start, 2 * GB);
        assert_eq!(read_pcie_cfg_mmio().start, 3 * GB);
        assert_eq!(read_pcie_cfg_mmio().len().unwrap(), 256 * MB);
    }

    /// The low PCI MMIO region starts on a 256 MB boundary.
    #[test]
    fn check_32bit_gap_size_alignment() {
        setup();
        // pci_low_start is 256 MB aligned to be friendly for MTRR mappings.
        assert_eq!(read_pci_mmio_before_32bit().start % (256 * MB), 0);
    }

    /// Writing an empty setup data list reports that there is no head entry.
    #[test]
    fn write_setup_data_empty() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();
        let setup_data = [];
        let setup_data_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        assert_eq!(setup_data_addr, None);
    }

    /// Two entries are written back to back as a linked list: the first entry's `next` field
    /// points at the second, and the second entry's `next` field is 0.
    #[test]
    fn write_setup_data_two_of_them() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();

        // Within each entry the test reads the `next` pointer at offset 0, the payload length
        // at offset 12 and the payload itself at offset 16.
        let entry1_addr = GuestAddress(0x1000);
        let entry1_next_addr = entry1_addr;
        let entry1_len_addr = entry1_addr.checked_add(12).unwrap();
        let entry1_data_addr = entry1_addr.checked_add(16).unwrap();
        let entry1_data = [0x55u8; 13];
        let entry1_size = (size_of::<setup_data_hdr>() + entry1_data.len()) as u64;
        // Padding bytes expected between the end of entry 1 and the start of entry 2.
        let entry1_align = 3;

        let entry2_addr = GuestAddress(entry1_addr.offset() + entry1_size + entry1_align);
        let entry2_next_addr = entry2_addr;
        let entry2_len_addr = entry2_addr.checked_add(12).unwrap();
        let entry2_data_addr = entry2_addr.checked_add(16).unwrap();
        let entry2_data = [0xAAu8; 9];

        let setup_data = [
            SetupData {
                data: entry1_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
            SetupData {
                data: entry2_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
        ];

        let setup_data_head_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        assert_eq!(setup_data_head_addr, Some(entry1_addr));

        // Entry 1 links to entry 2 and carries its own length and payload.
        assert_eq!(
            mem.read_obj_from_addr::<u64>(entry1_next_addr).unwrap(),
            entry2_addr.offset()
        );
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry1_len_addr).unwrap(),
            entry1_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 13]>(entry1_data_addr)
                .unwrap(),
            entry1_data
        );

        // Entry 2 terminates the list.
        assert_eq!(mem.read_obj_from_addr::<u64>(entry2_next_addr).unwrap(), 0);
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry2_len_addr).unwrap(),
            entry2_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 9]>(entry2_data_addr).unwrap(),
            entry2_data
        );
    }
}
2539