// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6 
7 mod fdt;
8 
9 const SETUP_DTB: u32 = 2;
10 const X86_64_FDT_MAX_SIZE: u64 = 0x20_0000;
11 
12 #[allow(dead_code)]
13 #[allow(non_upper_case_globals)]
14 #[allow(non_camel_case_types)]
15 #[allow(non_snake_case)]
16 mod bootparam;
17 
18 // boot_params is just a series of ints, it is safe to initialize it.
19 unsafe impl data_model::DataInit for bootparam::boot_params {}
20 
21 #[allow(dead_code)]
22 #[allow(non_upper_case_globals)]
23 mod msr_index;
24 
25 #[allow(dead_code)]
26 #[allow(non_upper_case_globals)]
27 #[allow(non_camel_case_types)]
28 #[allow(clippy::all)]
29 mod mpspec;
30 // These mpspec types are only data, reading them from data is a safe initialization.
31 unsafe impl data_model::DataInit for mpspec::mpc_bus {}
32 unsafe impl data_model::DataInit for mpspec::mpc_cpu {}
33 unsafe impl data_model::DataInit for mpspec::mpc_intsrc {}
34 unsafe impl data_model::DataInit for mpspec::mpc_ioapic {}
35 unsafe impl data_model::DataInit for mpspec::mpc_table {}
36 unsafe impl data_model::DataInit for mpspec::mpc_lintsrc {}
37 unsafe impl data_model::DataInit for mpspec::mpf_intel {}
38 
39 mod acpi;
40 mod bzimage;
41 mod cpuid;
42 mod gdt;
43 mod interrupts;
44 mod mptable;
45 mod regs;
46 mod smbios;
47 
48 use std::collections::BTreeMap;
49 use std::convert::TryFrom;
50 use std::ffi::{CStr, CString};
51 use std::fs::File;
52 use std::io::{self, Seek};
53 use std::mem;
54 use std::sync::Arc;
55 
56 use crate::bootparam::boot_params;
57 use acpi_tables::sdt::SDT;
58 use acpi_tables::{aml, aml::Aml};
59 use arch::{get_serial_cmdline, GetSerialCmdlineError, RunnableLinuxVm, VmComponents, VmImage};
60 use base::{warn, Event};
61 use devices::serial_device::{SerialHardware, SerialParameters};
62 use devices::{
63     BusDeviceObj, BusResumeDevice, IrqChip, IrqChipX86_64, PciAddress, PciConfigIo, PciConfigMmio,
64     PciDevice, PciVirtualConfigMmio,
65 };
66 use hypervisor::{HypervisorX86_64, ProtectionType, VcpuX86_64, Vm, VmX86_64};
67 use minijail::Minijail;
68 use remain::sorted;
69 use resources::{MemRegion, SystemAllocator, SystemAllocatorConfig};
70 use sync::Mutex;
71 use thiserror::Error;
72 use vm_control::{BatControl, BatteryType};
73 use vm_memory::{GuestAddress, GuestMemory, GuestMemoryError};
74 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
75 use {
76     gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs},
77     hypervisor::x86_64::{Regs, Sregs},
78 };
79 
80 #[sorted]
81 #[derive(Error, Debug)]
82 pub enum Error {
83     #[error("error allocating IO resource: {0}")]
84     AllocateIOResouce(resources::Error),
85     #[error("error allocating a single irq")]
86     AllocateIrq,
87     #[error("unable to clone an Event: {0}")]
88     CloneEvent(base::Error),
89     #[error("failed to clone IRQ chip: {0}")]
90     CloneIrqChip(base::Error),
91     #[error("the given kernel command line was invalid: {0}")]
92     Cmdline(kernel_cmdline::Error),
93     #[error("failed to configure hotplugged pci device: {0}")]
94     ConfigurePciDevice(arch::DeviceRegistrationError),
95     #[error("error configuring the system")]
96     ConfigureSystem,
97     #[error("unable to create ACPI tables")]
98     CreateAcpi,
99     #[error("unable to create battery devices: {0}")]
100     CreateBatDevices(arch::DeviceRegistrationError),
101     #[error("unable to make an Event: {0}")]
102     CreateEvent(base::Error),
103     #[error("failed to create fdt: {0}")]
104     CreateFdt(arch::fdt::Error),
105     #[cfg(feature = "direct")]
106     #[error("failed to enable GPE forwarding: {0}")]
107     CreateGpe(devices::DirectIrqError),
108     #[error("failed to create IOAPIC device: {0}")]
109     CreateIoapicDevice(base::Error),
110     #[error("failed to create a PCI root hub: {0}")]
111     CreatePciRoot(arch::DeviceRegistrationError),
112     #[error("unable to create PIT: {0}")]
113     CreatePit(base::Error),
114     #[error("unable to make PIT device: {0}")]
115     CreatePitDevice(devices::PitError),
116     #[error("unable to create serial devices: {0}")]
117     CreateSerialDevices(arch::DeviceRegistrationError),
118     #[error("failed to create socket: {0}")]
119     CreateSocket(io::Error),
120     #[error("failed to create VCPU: {0}")]
121     CreateVcpu(base::Error),
122     #[error("invalid e820 setup params")]
123     E820Configuration,
124     #[error("failed to enable singlestep execution: {0}")]
125     EnableSinglestep(base::Error),
126     #[error("failed to enable split irqchip: {0}")]
127     EnableSplitIrqchip(base::Error),
128     #[error("failed to get serial cmdline: {0}")]
129     GetSerialCmdline(GetSerialCmdlineError),
130     #[error("the kernel extends past the end of RAM")]
131     KernelOffsetPastEnd,
132     #[error("error loading bios: {0}")]
133     LoadBios(io::Error),
134     #[error("error loading kernel bzImage: {0}")]
135     LoadBzImage(bzimage::Error),
136     #[error("error loading command line: {0}")]
137     LoadCmdline(kernel_loader::Error),
138     #[error("error loading initrd: {0}")]
139     LoadInitrd(arch::LoadImageError),
140     #[error("error loading Kernel: {0}")]
141     LoadKernel(kernel_loader::Error),
142     #[error("error translating address: Page not present")]
143     PageNotPresent,
144     #[error("error reading guest memory {0}")]
145     ReadingGuestMemory(vm_memory::GuestMemoryError),
146     #[error("error reading CPU registers {0}")]
147     ReadRegs(base::Error),
148     #[error("error registering an IrqFd: {0}")]
149     RegisterIrqfd(base::Error),
150     #[error("error registering virtual socket device: {0}")]
151     RegisterVsock(arch::DeviceRegistrationError),
152     #[error("failed to set a hardware breakpoint: {0}")]
153     SetHwBreakpoint(base::Error),
154     #[error("failed to set interrupts: {0}")]
155     SetLint(interrupts::Error),
156     #[error("failed to set tss addr: {0}")]
157     SetTssAddr(base::Error),
158     #[error("failed to set up cpuid: {0}")]
159     SetupCpuid(cpuid::Error),
160     #[error("failed to set up FPU: {0}")]
161     SetupFpu(regs::Error),
162     #[error("failed to set up guest memory: {0}")]
163     SetupGuestMemory(GuestMemoryError),
164     #[error("failed to set up mptable: {0}")]
165     SetupMptable(mptable::Error),
166     #[error("failed to set up MSRs: {0}")]
167     SetupMsrs(regs::Error),
168     #[error("failed to set up registers: {0}")]
169     SetupRegs(regs::Error),
170     #[error("failed to set up SMBIOS: {0}")]
171     SetupSmbios(smbios::Error),
172     #[error("failed to set up sregs: {0}")]
173     SetupSregs(regs::Error),
174     #[error("failed to translate virtual address")]
175     TranslatingVirtAddr,
176     #[error("protected VMs not supported on x86_64")]
177     UnsupportedProtectionType,
178     #[error("error writing CPU registers {0}")]
179     WriteRegs(base::Error),
180     #[error("error writing guest memory {0}")]
181     WritingGuestMemory(GuestMemoryError),
182     #[error("the zero page extends past the end of guest_mem")]
183     ZeroPagePastRamEnd,
184     #[error("error writing the zero page of guest memory")]
185     ZeroPageSetup,
186 }
187 
188 pub type Result<T> = std::result::Result<T, Error>;
189 
190 pub struct X8664arch;
191 
/// Memory type tag written into the `type_` field of an e820 map entry.
enum E820Type {
    Ram = 0x01,
    Reserved = 0x2,
}
196 
const MB: u64 = 1 << 20;
const GB: u64 = 1 << 30;

// Initial stack pointer handed to `regs::setup_regs` when booting a kernel directly.
const BOOT_STACK_POINTER: u64 = 0x8000;
// Size of the hole below 4 GiB that is not backed by guest RAM.
// Keep it aligned to 256 MiB, which is convenient for MTRR setup.
const MEM_32BIT_GAP_SIZE: u64 = if cfg!(feature = "direct") {
    // Allow space for identity mapping coreboot memory regions on the host
    // which is found at around 7a00_0000 (little bit before 2GB)
    //
    // TODO(b/188011323): stop hardcoding sizes and addresses here and instead
    // determine the memory map from how the VM has been configured via the
    // command line.
    2560 * MB
} else {
    768 * MB
};
// Guest physical address at which low RAM starts.
const START_OF_RAM_32BITS: u64 = if cfg!(feature = "direct") { 0x1000 } else { 0 };
const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
// Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/.....
const RESERVED_MEM_SIZE: u64 = 0x800_0000;
// Reserve 64MB for PCIe enhanced configuration.
const PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000;
const PCIE_CFG_MMIO_START: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - PCIE_CFG_MMIO_SIZE;
// Reserve a memory region of the same size for PCIe virtual configuration.
const PCIE_VCFG_MMIO_SIZE: u64 = PCIE_CFG_MMIO_SIZE;
// First address of the 32-bit gap, i.e. the end of low guest RAM.
const END_ADDR_BEFORE_32BITS: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
// Portion of the 32-bit gap left for PCI MMIO after the reserved and PCIe config regions.
const PCI_MMIO_SIZE: u64 = MEM_32BIT_GAP_SIZE - RESERVED_MEM_SIZE - PCIE_CFG_MMIO_SIZE;
// Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
const HIGH_MMIO_MAX_END: u64 = 1u64 << 46;
// Offset from the kernel load address of the entry point handed to `regs::setup_regs`.
const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
// Guest physical address at which the boot parameters (zero page) are written.
const ZERO_PAGE_OFFSET: u64 = 0x7000;
// Address passed to `Vm::set_tss_addr`.
const TSS_ADDR: u64 = 0xfffb_d000;

// Guest physical address at which the kernel is loaded.
const KERNEL_START_OFFSET: u64 = 0x20_0000;
// Guest physical address at which the kernel command line is placed.
const CMDLINE_OFFSET: u64 = 0x2_0000;
// The command line may grow up to the start of the kernel image.
const CMDLINE_MAX_SIZE: u64 = KERNEL_START_OFFSET - CMDLINE_OFFSET;
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// X86_64_SCI_IRQ is used to fill the ACPI FACP table.
// The SCI IRQ number should be a legacy IRQ number, i.e. less than 16
// (most platforms actually use the fixed IRQ number 9). So we reserve
// IRQ number 5 for SCI and let the other devices start from the next one.
pub const X86_64_SCI_IRQ: u32 = 5;
// The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
pub const X86_64_IRQ_BASE: u32 = 9;
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;
245 
246 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
247 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
248 /// pointer at the effective physical address 0xFFFF_FFF0.
bios_start(bios_size: u64) -> GuestAddress249 fn bios_start(bios_size: u64) -> GuestAddress {
250     GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
251 }
252 
configure_system( guest_mem: &GuestMemory, kernel_addr: GuestAddress, cmdline_addr: GuestAddress, cmdline_size: usize, setup_data: Option<GuestAddress>, initrd: Option<(GuestAddress, usize)>, mut params: boot_params, ) -> Result<()>253 fn configure_system(
254     guest_mem: &GuestMemory,
255     kernel_addr: GuestAddress,
256     cmdline_addr: GuestAddress,
257     cmdline_size: usize,
258     setup_data: Option<GuestAddress>,
259     initrd: Option<(GuestAddress, usize)>,
260     mut params: boot_params,
261 ) -> Result<()> {
262     const EBDA_START: u64 = 0x0009_fc00;
263     const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
264     const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
265     const KERNEL_LOADER_OTHER: u8 = 0xff;
266     const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.
267     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
268     let end_32bit_gap_start = GuestAddress(END_ADDR_BEFORE_32BITS);
269 
270     params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
271     params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
272     params.hdr.header = KERNEL_HDR_MAGIC;
273     params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
274     params.hdr.cmdline_size = cmdline_size as u32;
275     params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
276     if let Some(setup_data) = setup_data {
277         params.hdr.setup_data = setup_data.offset();
278     }
279     if let Some((initrd_addr, initrd_size)) = initrd {
280         params.hdr.ramdisk_image = initrd_addr.offset() as u32;
281         params.hdr.ramdisk_size = initrd_size as u32;
282     }
283 
284     add_e820_entry(
285         &mut params,
286         START_OF_RAM_32BITS,
287         EBDA_START - START_OF_RAM_32BITS,
288         E820Type::Ram,
289     )?;
290 
291     let mem_end = guest_mem.end_addr();
292     if mem_end < end_32bit_gap_start {
293         add_e820_entry(
294             &mut params,
295             kernel_addr.offset() as u64,
296             mem_end.offset_from(kernel_addr) as u64,
297             E820Type::Ram,
298         )?;
299     } else {
300         add_e820_entry(
301             &mut params,
302             kernel_addr.offset() as u64,
303             end_32bit_gap_start.offset_from(kernel_addr) as u64,
304             E820Type::Ram,
305         )?;
306         if mem_end > first_addr_past_32bits {
307             add_e820_entry(
308                 &mut params,
309                 first_addr_past_32bits.offset() as u64,
310                 mem_end.offset_from(first_addr_past_32bits) as u64,
311                 E820Type::Ram,
312             )?;
313         }
314     }
315 
316     add_e820_entry(
317         &mut params,
318         PCIE_CFG_MMIO_START,
319         PCIE_CFG_MMIO_SIZE,
320         E820Type::Reserved,
321     )?;
322 
323     add_e820_entry(
324         &mut params,
325         X8664arch::get_pcie_vcfg_mmio_base(guest_mem),
326         PCIE_VCFG_MMIO_SIZE,
327         E820Type::Reserved,
328     )?;
329 
330     let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
331     guest_mem
332         .checked_offset(zero_page_addr, mem::size_of::<boot_params>() as u64)
333         .ok_or(Error::ZeroPagePastRamEnd)?;
334     guest_mem
335         .write_obj_at_addr(params, zero_page_addr)
336         .map_err(|_| Error::ZeroPageSetup)?;
337 
338     Ok(())
339 }
340 
341 /// Add an e820 region to the e820 map.
342 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
add_e820_entry( params: &mut boot_params, addr: u64, size: u64, mem_type: E820Type, ) -> Result<()>343 fn add_e820_entry(
344     params: &mut boot_params,
345     addr: u64,
346     size: u64,
347     mem_type: E820Type,
348 ) -> Result<()> {
349     if params.e820_entries >= params.e820_table.len() as u8 {
350         return Err(Error::E820Configuration);
351     }
352 
353     params.e820_table[params.e820_entries as usize].addr = addr;
354     params.e820_table[params.e820_entries as usize].size = size;
355     params.e820_table[params.e820_entries as usize].type_ = mem_type as u32;
356     params.e820_entries += 1;
357 
358     Ok(())
359 }
360 
361 /// Returns a Vec of the valid memory addresses.
362 /// These should be used to configure the GuestMemory structure for the platform.
363 /// For x86_64 all addresses are valid from the start of the kernel except a
364 /// carve out at the end of 32bit address space.
arch_memory_regions(size: u64, bios_size: Option<u64>) -> Vec<(GuestAddress, u64)>365 fn arch_memory_regions(size: u64, bios_size: Option<u64>) -> Vec<(GuestAddress, u64)> {
366     let mem_start = START_OF_RAM_32BITS;
367     let mem_end = GuestAddress(size + mem_start);
368     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
369     let end_32bit_gap_start = GuestAddress(END_ADDR_BEFORE_32BITS);
370     let mut regions = Vec::new();
371     if mem_end <= end_32bit_gap_start {
372         regions.push((GuestAddress(mem_start), size));
373         if let Some(bios_size) = bios_size {
374             regions.push((bios_start(bios_size), bios_size));
375         }
376     } else {
377         regions.push((
378             GuestAddress(mem_start),
379             end_32bit_gap_start.offset() - mem_start,
380         ));
381         if let Some(bios_size) = bios_size {
382             regions.push((bios_start(bios_size), bios_size));
383         }
384         regions.push((
385             first_addr_past_32bits,
386             mem_end.offset_from(end_32bit_gap_start),
387         ));
388     }
389 
390     regions
391 }
392 
393 impl arch::LinuxArch for X8664arch {
394     type Error = Error;
395 
guest_memory_layout( components: &VmComponents, ) -> std::result::Result<Vec<(GuestAddress, u64)>, Self::Error>396     fn guest_memory_layout(
397         components: &VmComponents,
398     ) -> std::result::Result<Vec<(GuestAddress, u64)>, Self::Error> {
399         let bios_size = match &components.vm_image {
400             VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
401             VmImage::Kernel(_) => None,
402         };
403         Ok(arch_memory_regions(components.memory_size, bios_size))
404     }
405 
get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig406     fn get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig {
407         let guest_mem = vm.get_memory();
408         let high_mmio_start = Self::get_high_mmio_base(guest_mem);
409         let high_mmio_size = Self::get_high_mmio_size(vm);
410         SystemAllocatorConfig {
411             io: Some(MemRegion {
412                 base: 0xc000,
413                 size: 0x4000,
414             }),
415             low_mmio: MemRegion {
416                 base: END_ADDR_BEFORE_32BITS,
417                 size: PCI_MMIO_SIZE,
418             },
419             high_mmio: MemRegion {
420                 base: high_mmio_start,
421                 size: high_mmio_size,
422             },
423             platform_mmio: None,
424             first_irq: X86_64_IRQ_BASE,
425         }
426     }
427 
build_vm<V, Vcpu>( mut components: VmComponents, exit_evt: &Event, reset_evt: &Event, system_allocator: &mut SystemAllocator, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, battery: (&Option<BatteryType>, Option<Minijail>), mut vm: V, ramoops_region: Option<arch::pstore::RamoopsRegion>, devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, irq_chip: &mut dyn IrqChipX86_64, kvm_vcpu_ids: &mut Vec<usize>, ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error> where V: VmX86_64, Vcpu: VcpuX86_64,428     fn build_vm<V, Vcpu>(
429         mut components: VmComponents,
430         exit_evt: &Event,
431         reset_evt: &Event,
432         system_allocator: &mut SystemAllocator,
433         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
434         serial_jail: Option<Minijail>,
435         battery: (&Option<BatteryType>, Option<Minijail>),
436         mut vm: V,
437         ramoops_region: Option<arch::pstore::RamoopsRegion>,
438         devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
439         irq_chip: &mut dyn IrqChipX86_64,
440         kvm_vcpu_ids: &mut Vec<usize>,
441     ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
442     where
443         V: VmX86_64,
444         Vcpu: VcpuX86_64,
445     {
446         if components.protected_vm != ProtectionType::Unprotected {
447             return Err(Error::UnsupportedProtectionType);
448         }
449 
450         let mem = vm.get_memory().clone();
451 
452         let vcpu_count = components.vcpu_count;
453 
454         let tss_addr = GuestAddress(TSS_ADDR);
455         vm.set_tss_addr(tss_addr).map_err(Error::SetTssAddr)?;
456 
457         // Use IRQ info in ACPI if provided by the user.
458         let mut noirq = true;
459         let mut mptable = true;
460         let mut sci_irq = X86_64_SCI_IRQ;
461 
462         for sdt in components.acpi_sdts.iter() {
463             if sdt.is_signature(b"DSDT") || sdt.is_signature(b"APIC") {
464                 noirq = false;
465             } else if sdt.is_signature(b"FACP") {
466                 mptable = false;
467                 let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
468                 sci_irq = sci_irq_fadt.into();
469                 if !system_allocator.reserve_irq(sci_irq) {
470                     warn!("sci irq {} already reserved.", sci_irq);
471                 }
472             }
473         }
474 
475         let mmio_bus = Arc::new(devices::Bus::new());
476         let io_bus = Arc::new(devices::Bus::new());
477 
478         let (pci_devices, _others): (Vec<_>, Vec<_>) = devs
479             .into_iter()
480             .partition(|(dev, _)| dev.as_pci_device().is_some());
481 
482         let pci_devices = pci_devices
483             .into_iter()
484             .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
485             .collect();
486 
487         let (pci, pci_irqs, pid_debug_label_map) = arch::generate_pci_root(
488             pci_devices,
489             irq_chip.as_irq_chip_mut(),
490             mmio_bus.clone(),
491             io_bus.clone(),
492             system_allocator,
493             &mut vm,
494             4, // Share the four pin interrupts (INTx#)
495         )
496         .map_err(Error::CreatePciRoot)?;
497 
498         let pci = Arc::new(Mutex::new(pci));
499         pci.lock().enable_pcie_cfg_mmio(PCIE_CFG_MMIO_START);
500         let pci_cfg = PciConfigIo::new(
501             pci.clone(),
502             reset_evt.try_clone().map_err(Error::CloneEvent)?,
503         );
504         let pci_bus = Arc::new(Mutex::new(pci_cfg));
505         io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();
506 
507         let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
508         mmio_bus
509             .insert(pcie_cfg_mmio, PCIE_CFG_MMIO_START, PCIE_CFG_MMIO_SIZE)
510             .unwrap();
511 
512         let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 12)));
513         mmio_bus
514             .insert(
515                 pcie_vcfg_mmio,
516                 Self::get_pcie_vcfg_mmio_base(&mem),
517                 PCIE_VCFG_MMIO_SIZE,
518             )
519             .unwrap();
520 
521         // Event used to notify crosvm that guest OS is trying to suspend.
522         let suspend_evt = Event::new().map_err(Error::CreateEvent)?;
523 
524         if !components.no_legacy {
525             Self::setup_legacy_devices(
526                 &io_bus,
527                 irq_chip.pit_uses_speaker_port(),
528                 reset_evt.try_clone().map_err(Error::CloneEvent)?,
529                 components.memory_size,
530             )?;
531         }
532         Self::setup_serial_devices(
533             components.protected_vm,
534             irq_chip.as_irq_chip_mut(),
535             &io_bus,
536             serial_parameters,
537             serial_jail,
538         )?;
539 
540         let mut resume_notify_devices = Vec::new();
541 
542         // each bus occupy 1MB mmio for pcie enhanced configuration
543         let max_bus = ((PCIE_CFG_MMIO_SIZE / 0x100000) - 1) as u8;
544 
545         let (acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
546             &mem,
547             &io_bus,
548             system_allocator,
549             suspend_evt.try_clone().map_err(Error::CloneEvent)?,
550             exit_evt.try_clone().map_err(Error::CloneEvent)?,
551             components.acpi_sdts,
552             #[cfg(feature = "direct")]
553             &components.direct_gpe,
554             irq_chip.as_irq_chip_mut(),
555             sci_irq,
556             battery,
557             &mmio_bus,
558             max_bus,
559             &mut resume_notify_devices,
560         )?;
561 
562         irq_chip
563             .finalize_devices(system_allocator, &io_bus, &mmio_bus)
564             .map_err(Error::RegisterIrqfd)?;
565 
566         // All of these bios generated tables are set manually for the benefit of the kernel boot
567         // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't
568         // have a way to pass the BIOS these configs.
569         // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
570         // tables and the guest OS picks them up.
571         // If another guest does need a way to pass these tables down to it's BIOS, this approach
572         // should be rethought.
573 
574         if mptable {
575             // Note that this puts the mptable at 0x9FC00 in guest physical memory.
576             mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
577                 .map_err(Error::SetupMptable)?;
578         }
579         smbios::setup_smbios(&mem, components.dmi_path).map_err(Error::SetupSmbios)?;
580 
581         let host_cpus = if components.host_cpu_topology {
582             components.vcpu_affinity.clone()
583         } else {
584             None
585         };
586 
587         // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
588         acpi::create_acpi_tables(
589             &mem,
590             vcpu_count as u8,
591             sci_irq,
592             0xcf9,
593             6, // RST_CPU|SYS_RST
594             &acpi_dev_resource,
595             host_cpus,
596             kvm_vcpu_ids,
597             &pci_irqs,
598             PCIE_CFG_MMIO_START,
599             max_bus,
600             components.force_s2idle,
601         )
602         .ok_or(Error::CreateAcpi)?;
603 
604         let mut cmdline = Self::get_base_linux_cmdline();
605 
606         if noirq {
607             cmdline.insert_str("acpi=noirq").unwrap();
608         }
609 
610         get_serial_cmdline(&mut cmdline, serial_parameters, "io")
611             .map_err(Error::GetSerialCmdline)?;
612 
613         for param in components.extra_kernel_params {
614             cmdline.insert_str(&param).map_err(Error::Cmdline)?;
615         }
616 
617         if let Some(ramoops_region) = ramoops_region {
618             arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
619                 .map_err(Error::Cmdline)?;
620         }
621 
622         match components.vm_image {
623             VmImage::Bios(ref mut bios) => {
624                 // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
625                 kernel_loader::load_cmdline(
626                     &mem,
627                     GuestAddress(CMDLINE_OFFSET),
628                     &CString::new(cmdline).unwrap(),
629                 )
630                 .map_err(Error::LoadCmdline)?;
631                 Self::load_bios(&mem, bios)?
632             }
633             VmImage::Kernel(ref mut kernel_image) => {
634                 // separate out load_kernel from other setup to get a specific error for
635                 // kernel loading
636                 let (params, kernel_end) = Self::load_kernel(&mem, kernel_image)?;
637 
638                 Self::setup_system_memory(
639                     &mem,
640                     &CString::new(cmdline).unwrap(),
641                     components.initrd_image,
642                     components.android_fstab,
643                     kernel_end,
644                     params,
645                 )?;
646             }
647         }
648 
649         Ok(RunnableLinuxVm {
650             vm,
651             vcpu_count,
652             vcpus: None,
653             vcpu_affinity: components.vcpu_affinity,
654             no_smt: components.no_smt,
655             irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?,
656             has_bios: matches!(components.vm_image, VmImage::Bios(_)),
657             io_bus,
658             mmio_bus,
659             pid_debug_label_map,
660             suspend_evt,
661             resume_notify_devices,
662             rt_cpus: components.rt_cpus,
663             delay_rt: components.delay_rt,
664             bat_control,
665             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
666             gdb: components.gdb,
667             pm: Some(acpi_dev_resource.pm),
668             root_config: pci,
669             hotplug_bus: Vec::new(),
670         })
671     }
672 
configure_vcpu<V: Vm>( vm: &V, hypervisor: &dyn HypervisorX86_64, irq_chip: &mut dyn IrqChipX86_64, vcpu: &mut dyn VcpuX86_64, vcpu_id: usize, num_cpus: usize, has_bios: bool, no_smt: bool, host_cpu_topology: bool, ) -> Result<()>673     fn configure_vcpu<V: Vm>(
674         vm: &V,
675         hypervisor: &dyn HypervisorX86_64,
676         irq_chip: &mut dyn IrqChipX86_64,
677         vcpu: &mut dyn VcpuX86_64,
678         vcpu_id: usize,
679         num_cpus: usize,
680         has_bios: bool,
681         no_smt: bool,
682         host_cpu_topology: bool,
683     ) -> Result<()> {
684         cpuid::setup_cpuid(
685             hypervisor,
686             irq_chip,
687             vcpu,
688             vcpu_id,
689             num_cpus,
690             no_smt,
691             host_cpu_topology,
692         )
693         .map_err(Error::SetupCpuid)?;
694 
695         if has_bios {
696             return Ok(());
697         }
698 
699         let guest_mem = vm.get_memory();
700         let kernel_load_addr = GuestAddress(KERNEL_START_OFFSET);
701         regs::setup_msrs(vm, vcpu, END_ADDR_BEFORE_32BITS).map_err(Error::SetupMsrs)?;
702         let kernel_end = guest_mem
703             .checked_offset(kernel_load_addr, KERNEL_64BIT_ENTRY_OFFSET)
704             .ok_or(Error::KernelOffsetPastEnd)?;
705         regs::setup_regs(
706             vcpu,
707             (kernel_end).offset() as u64,
708             BOOT_STACK_POINTER as u64,
709             ZERO_PAGE_OFFSET as u64,
710         )
711         .map_err(Error::SetupRegs)?;
712         regs::setup_fpu(vcpu).map_err(Error::SetupFpu)?;
713         regs::setup_sregs(guest_mem, vcpu).map_err(Error::SetupSregs)?;
714         interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
715 
716         Ok(())
717     }
718 
register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>( linux: &mut RunnableLinuxVm<V, Vcpu>, device: Box<dyn PciDevice>, minijail: Option<Minijail>, resources: &mut SystemAllocator, ) -> Result<PciAddress>719     fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>(
720         linux: &mut RunnableLinuxVm<V, Vcpu>,
721         device: Box<dyn PciDevice>,
722         minijail: Option<Minijail>,
723         resources: &mut SystemAllocator,
724     ) -> Result<PciAddress> {
725         let pci_address = arch::configure_pci_device(linux, device, minijail, resources)
726             .map_err(Error::ConfigurePciDevice)?;
727 
728         Ok(pci_address)
729     }
730 
731     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
debug_read_registers<T: VcpuX86_64>(vcpu: &T) -> Result<X86_64CoreRegs>732     fn debug_read_registers<T: VcpuX86_64>(vcpu: &T) -> Result<X86_64CoreRegs> {
733         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
734         let gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
735         let regs = [
736             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
737             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
738         ];
739 
740         // GDB exposes 32-bit eflags instead of 64-bit rflags.
741         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
742         let eflags = gregs.rflags as u32;
743         let rip = gregs.rip;
744 
745         // Segment registers: CS, SS, DS, ES, FS, GS
746         let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
747         let segments = X86SegmentRegs {
748             cs: sregs.cs.selector as u32,
749             ss: sregs.ss.selector as u32,
750             ds: sregs.ds.selector as u32,
751             es: sregs.es.selector as u32,
752             fs: sregs.fs.selector as u32,
753             gs: sregs.gs.selector as u32,
754         };
755 
756         // TODO(keiichiw): Other registers such as FPU, xmm and mxcsr.
757 
758         Ok(X86_64CoreRegs {
759             regs,
760             eflags,
761             rip,
762             segments,
763             ..Default::default()
764         })
765     }
766 
767     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
debug_write_registers<T: VcpuX86_64>(vcpu: &T, regs: &X86_64CoreRegs) -> Result<()>768     fn debug_write_registers<T: VcpuX86_64>(vcpu: &T, regs: &X86_64CoreRegs) -> Result<()> {
769         // General purpose registers (RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15) + RIP + rflags
770         let orig_gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
771         let gregs = Regs {
772             rax: regs.regs[0],
773             rbx: regs.regs[1],
774             rcx: regs.regs[2],
775             rdx: regs.regs[3],
776             rsi: regs.regs[4],
777             rdi: regs.regs[5],
778             rbp: regs.regs[6],
779             rsp: regs.regs[7],
780             r8: regs.regs[8],
781             r9: regs.regs[9],
782             r10: regs.regs[10],
783             r11: regs.regs[11],
784             r12: regs.regs[12],
785             r13: regs.regs[13],
786             r14: regs.regs[14],
787             r15: regs.regs[15],
788             rip: regs.rip,
789             // Update the lower 32 bits of rflags.
790             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
791         };
792         vcpu.set_regs(&gregs).map_err(Error::WriteRegs)?;
793 
794         // Segment registers: CS, SS, DS, ES, FS, GS
795         // Since GDB care only selectors, we call get_sregs() first.
796         let mut sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
797         sregs.cs.selector = regs.segments.cs as u16;
798         sregs.ss.selector = regs.segments.ss as u16;
799         sregs.ds.selector = regs.segments.ds as u16;
800         sregs.es.selector = regs.segments.es as u16;
801         sregs.fs.selector = regs.segments.fs as u16;
802         sregs.gs.selector = regs.segments.gs as u16;
803 
804         vcpu.set_sregs(&sregs).map_err(Error::WriteRegs)?;
805 
806         // TODO(keiichiw): Other registers such as FPU, xmm and mxcsr.
807 
808         Ok(())
809     }
810 
811     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
debug_read_memory<T: VcpuX86_64>( vcpu: &T, guest_mem: &GuestMemory, vaddr: GuestAddress, len: usize, ) -> Result<Vec<u8>>812     fn debug_read_memory<T: VcpuX86_64>(
813         vcpu: &T,
814         guest_mem: &GuestMemory,
815         vaddr: GuestAddress,
816         len: usize,
817     ) -> Result<Vec<u8>> {
818         let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
819         let mut buf = vec![0; len];
820         let mut total_read = 0u64;
821         // Handle reads across page boundaries.
822 
823         while total_read < len as u64 {
824             let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_read, &sregs)?;
825             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
826             guest_mem
827                 .get_slice_at_addr(GuestAddress(paddr), read_len as usize)
828                 .map_err(Error::ReadingGuestMemory)?
829                 .copy_to(&mut buf[total_read as usize..]);
830             total_read += read_len;
831         }
832         Ok(buf)
833     }
834 
835     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
debug_write_memory<T: VcpuX86_64>( vcpu: &T, guest_mem: &GuestMemory, vaddr: GuestAddress, buf: &[u8], ) -> Result<()>836     fn debug_write_memory<T: VcpuX86_64>(
837         vcpu: &T,
838         guest_mem: &GuestMemory,
839         vaddr: GuestAddress,
840         buf: &[u8],
841     ) -> Result<()> {
842         let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
843         let mut total_written = 0u64;
844         // Handle writes across page boundaries.
845         while total_written < buf.len() as u64 {
846             let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_written, &sregs)?;
847             let write_len = std::cmp::min(
848                 buf.len() as u64 - total_written,
849                 psize - (paddr & (psize - 1)),
850             );
851 
852             guest_mem
853                 .write_all_at_addr(
854                     &buf[total_written as usize..(total_written as usize + write_len as usize)],
855                     GuestAddress(paddr),
856                 )
857                 .map_err(Error::WritingGuestMemory)?;
858             total_written += write_len;
859         }
860         Ok(())
861     }
862 
863     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
debug_enable_singlestep<T: VcpuX86_64>(vcpu: &T) -> Result<()>864     fn debug_enable_singlestep<T: VcpuX86_64>(vcpu: &T) -> Result<()> {
865         vcpu.set_guest_debug(&[], true /* enable_singlestep */)
866             .map_err(Error::EnableSinglestep)
867     }
868 
869     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
debug_set_hw_breakpoints<T: VcpuX86_64>( vcpu: &T, breakpoints: &[GuestAddress], ) -> Result<()>870     fn debug_set_hw_breakpoints<T: VcpuX86_64>(
871         vcpu: &T,
872         breakpoints: &[GuestAddress],
873     ) -> Result<()> {
874         vcpu.set_guest_debug(breakpoints, false /* enable_singlestep */)
875             .map_err(Error::SetHwBreakpoint)
876     }
877 }
878 
/// Translates the guest virtual address `vaddr` into a guest physical address by walking the
/// guest's page tables.
///
/// Returns the translated address together with the size of the page it resides in.
///
/// * With paging disabled (`CR0.PG` clear) the address is returned unchanged with a 4K page size.
/// * In long mode the tables are walked from `CR3`, honouring 1G and 2M pages. When `CR4.LA57`
///   is set, the walk starts one level higher (5-level paging).
///
/// # Errors
///
/// * `Error::TranslatingVirtAddr` - paging is enabled without PAE, the guest is not in long
///   mode, or a page table entry could not be read from guest memory.
/// * `Error::PageNotPresent` - an entry on the walk does not have its present bit set.
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
fn phys_addr(mem: &GuestMemory, vaddr: u64, sregs: &Sregs) -> Result<(u64, u64)> {
    const CR0_PG_MASK: u64 = 1 << 31;
    const CR4_PAE_MASK: u64 = 1 << 5;
    const CR4_LA57_MASK: u64 = 1 << 12;
    const MSR_EFER_LMA: u64 = 1 << 10;
    // bits 12 through 51 are the address in a PTE.
    const PTE_ADDR_MASK: u64 = ((1 << 52) - 1) & !0x0fff;
    const PAGE_PRESENT: u64 = 0x1;
    const PAGE_PSE_MASK: u64 = 0x1 << 7;

    const PAGE_SIZE_4K: u64 = 4 * 1024;
    const PAGE_SIZE_2M: u64 = 2 * 1024 * 1024;
    const PAGE_SIZE_1G: u64 = 1024 * 1024 * 1024;

    // Reads the entry that maps `vaddr` from the `level` table located at `curr_table_addr`
    // (either CR3 or the entry of the level above) and checks that it is present.
    // TODO - trace level, vaddr, table address, entry and offset here.
    fn next_pte(mem: &GuestMemory, curr_table_addr: u64, vaddr: u64, level: usize) -> Result<u64> {
        let ent: u64 = mem
            .read_obj_from_addr(GuestAddress(
                (curr_table_addr & PTE_ADDR_MASK) + page_table_offset(vaddr, level),
            ))
            .map_err(|_| Error::TranslatingVirtAddr)?;
        if ent & PAGE_PRESENT == 0 {
            return Err(Error::PageNotPresent);
        }
        Ok(ent)
    }

    // Get the offset in to the page of `vaddr`.
    fn page_offset(vaddr: u64, page_size: u64) -> u64 {
        vaddr & (page_size - 1)
    }

    // Get the offset in to the page table of the given `level` specified by the virtual `address`.
    // `level` is 1 through 5 in x86_64 to handle the five levels of paging.
    fn page_table_offset(addr: u64, level: usize) -> u64 {
        let offset = (level - 1) * 9 + 12;
        ((addr >> offset) & 0x1ff) << 3
    }

    // Physical address described by the leaf entry `ent` that maps a page of `page_size` bytes.
    // The frame address is aligned down to `page_size` because in 2M/1G entries bit 12 is the
    // PAT bit and the remaining bits below the page size are reserved; none of them belong to
    // the address.
    fn leaf_addr(ent: u64, vaddr: u64, page_size: u64) -> u64 {
        (ent & PTE_ADDR_MASK & !(page_size - 1)) | page_offset(vaddr, page_size)
    }

    if sregs.cr0 & CR0_PG_MASK == 0 {
        // Paging is off: virtual and physical addresses are the same.
        return Ok((vaddr, PAGE_SIZE_4K));
    }

    if sregs.cr4 & CR4_PAE_MASK == 0 {
        return Err(Error::TranslatingVirtAddr);
    }

    if sregs.efer & MSR_EFER_LMA == 0 {
        // PAE paging outside of long mode is not handled.
        return Err(Error::TranslatingVirtAddr);
    }

    // With 5-level paging CR3 points at the level 5 table, whose entry then points at the
    // level 4 table; otherwise CR3 points at the level 4 table directly.
    let p4_table = if sregs.cr4 & CR4_LA57_MASK != 0 {
        next_pte(mem, sregs.cr3, vaddr, 5)?
    } else {
        sregs.cr3
    };
    let p4_ent = next_pte(mem, p4_table, vaddr, 4)?;

    let p3_ent = next_pte(mem, p4_ent, vaddr, 3)?;
    if p3_ent & PAGE_PSE_MASK != 0 {
        // It's a 1G page with the PSE bit in p3_ent
        return Ok((leaf_addr(p3_ent, vaddr, PAGE_SIZE_1G), PAGE_SIZE_1G));
    }

    let p2_ent = next_pte(mem, p3_ent, vaddr, 2)?;
    if p2_ent & PAGE_PSE_MASK != 0 {
        // It's a 2M page with the PSE bit in p2_ent
        return Ok((leaf_addr(p2_ent, vaddr, PAGE_SIZE_2M), PAGE_SIZE_2M));
    }

    let p1_ent = next_pte(mem, p2_ent, vaddr, 1)?;
    Ok((leaf_addr(p1_ent, vaddr, PAGE_SIZE_4K), PAGE_SIZE_4K))
}
961 
// Status bit reported in CDW1 of the _OSC return buffer. The _OSC method generated by
// `PciRootOSC` ORs it into CDW1 when the UUID passed in Arg0 is not the one it handles.
const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4;
// Control bits in CDW3 of the PCI host bridge _OSC buffer. `PciRootOSC` clears the SHPC_HP,
// PCIE_PME and PCIE_AER bits.
// PCIE_HP is not referenced in the nearby code, hence the dead_code allowance.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1;
const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2;
const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4;
const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8;
// PCIE_CAP is not referenced in the nearby code either.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10;
972 
973 struct PciRootOSC {}
974 
975 // Method (_OSC, 4, NotSerialized)  // _OSC: Operating System Capabilities
976 // {
977 //     CreateDWordField (Arg3, Zero, CDW1)  // flag and return value
978 //     If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
979 //     {
980 //         CreateDWordField (Arg3, 8, CDW3) // control field
981 //         if ( 0 == (CDW1 & 0x01))  // Query flag ?
982 //         {
983 //              CDW3 &= !(SHPC_HP | PME | AER)
984 //         }
985 //     } Else {
986 //         CDW1 |= UNSUPPORT_UUID
987 //     }
988 //     Return (Arg3)
989 // }
990 impl Aml for PciRootOSC {
to_aml_bytes(&self, aml: &mut Vec<u8>)991     fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
992         let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
993         // virtual pcie root port supports hotplug and pcie cap register only, clear all
994         // the other bits.
995         let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP
996             | PCI_HB_OSC_CONTROL_PCIE_PME
997             | PCI_HB_OSC_CONTROL_PCIE_AER);
998         aml::Method::new(
999             "_OSC".into(),
1000             4,
1001             false,
1002             vec![
1003                 &aml::CreateDWordField::new(
1004                     &aml::Name::new_field_name("CDW1"),
1005                     &aml::Arg(3),
1006                     &aml::ZERO,
1007                 ),
1008                 &aml::If::new(
1009                     &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
1010                     vec![
1011                         &aml::CreateDWordField::new(
1012                             &aml::Name::new_field_name("CDW3"),
1013                             &aml::Arg(3),
1014                             &(8_u8),
1015                         ),
1016                         &aml::If::new(
1017                             &aml::Equal::new(
1018                                 &aml::ZERO,
1019                                 &aml::And::new(
1020                                     &aml::Local(0),
1021                                     &aml::Name::new_field_name("CDW1"),
1022                                     &aml::ONE,
1023                                 ),
1024                             ),
1025                             vec![&aml::And::new(
1026                                 &aml::Name::new_field_name("CDW3"),
1027                                 &mask,
1028                                 &aml::Name::new_field_name("CDW3"),
1029                             )],
1030                         ),
1031                     ],
1032                 ),
1033                 &aml::Else::new(vec![&aml::Or::new(
1034                     &aml::Name::new_field_name("CDW1"),
1035                     &OSC_STATUS_UNSUPPORT_UUID,
1036                     &aml::Name::new_field_name("CDW1"),
1037                 )]),
1038                 &aml::Return::new(&aml::Arg(3)),
1039             ],
1040         )
1041         .to_aml_bytes(aml)
1042     }
1043 }
1044 
1045 impl X8664arch {
1046     /// Loads the bios from an open file.
1047     ///
1048     /// # Arguments
1049     ///
1050     /// * `mem` - The memory to be used by the guest.
1051     /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>1052     fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1053         let bios_image_length = bios_image
1054             .seek(io::SeekFrom::End(0))
1055             .map_err(Error::LoadBios)?;
1056         if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1057             return Err(Error::LoadBios(io::Error::new(
1058                 io::ErrorKind::InvalidData,
1059                 format!(
1060                     "bios was {} bytes, expected less than {}",
1061                     bios_image_length, FIRST_ADDR_PAST_32BITS,
1062                 ),
1063             )));
1064         }
1065         bios_image
1066             .seek(io::SeekFrom::Start(0))
1067             .map_err(Error::LoadBios)?;
1068         mem.read_to_memory(
1069             bios_start(bios_image_length),
1070             bios_image,
1071             bios_image_length as usize,
1072         )
1073         .map_err(Error::SetupGuestMemory)?;
1074         Ok(())
1075     }
1076 
1077     /// Loads the kernel from an open file.
1078     ///
1079     /// # Arguments
1080     ///
1081     /// * `mem` - The memory to be used by the guest.
1082     /// * `kernel_image` - the File object for the specified kernel.
load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)>1083     fn load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)> {
1084         let elf_result =
1085             kernel_loader::load_kernel(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image);
1086         if elf_result == Err(kernel_loader::Error::InvalidElfMagicNumber) {
1087             bzimage::load_bzimage(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image)
1088                 .map_err(Error::LoadBzImage)
1089         } else {
1090             let kernel_end = elf_result.map_err(Error::LoadKernel)?;
1091             Ok((Default::default(), kernel_end))
1092         }
1093     }
1094 
1095     /// Configures the system memory space should be called once per vm before
1096     /// starting vcpu threads.
1097     ///
1098     /// # Arguments
1099     ///
1100     /// * `mem` - The memory to be used by the guest.
1101     /// * `cmdline` - the kernel commandline
1102     /// * `initrd_file` - an initial ramdisk image
setup_system_memory( mem: &GuestMemory, cmdline: &CStr, initrd_file: Option<File>, android_fstab: Option<File>, kernel_end: u64, params: boot_params, ) -> Result<()>1103     fn setup_system_memory(
1104         mem: &GuestMemory,
1105         cmdline: &CStr,
1106         initrd_file: Option<File>,
1107         android_fstab: Option<File>,
1108         kernel_end: u64,
1109         params: boot_params,
1110     ) -> Result<()> {
1111         kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
1112             .map_err(Error::LoadCmdline)?;
1113 
1114         // Track the first free address after the kernel - this is where extra
1115         // data like the device tree blob and initrd will be loaded.
1116         let mut free_addr = kernel_end;
1117 
1118         let setup_data = if let Some(android_fstab) = android_fstab {
1119             let free_addr_aligned = (((free_addr + 64 - 1) / 64) * 64) + 64;
1120             let dtb_start = GuestAddress(free_addr_aligned);
1121             let dtb_size = fdt::create_fdt(
1122                 X86_64_FDT_MAX_SIZE as usize,
1123                 mem,
1124                 dtb_start.offset(),
1125                 android_fstab,
1126             )
1127             .map_err(Error::CreateFdt)?;
1128             free_addr = dtb_start.offset() + dtb_size as u64;
1129             Some(dtb_start)
1130         } else {
1131             None
1132         };
1133 
1134         let initrd = match initrd_file {
1135             Some(mut initrd_file) => {
1136                 let mut initrd_addr_max = u64::from(params.hdr.initrd_addr_max);
1137                 // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
1138                 if initrd_addr_max == 0 {
1139                     initrd_addr_max = 0x37FFFFFF;
1140                 }
1141 
1142                 let mem_max = mem.end_addr().offset() - 1;
1143                 if initrd_addr_max > mem_max {
1144                     initrd_addr_max = mem_max;
1145                 }
1146 
1147                 let (initrd_start, initrd_size) = arch::load_image_high(
1148                     mem,
1149                     &mut initrd_file,
1150                     GuestAddress(free_addr),
1151                     GuestAddress(initrd_addr_max),
1152                     base::pagesize() as u64,
1153                 )
1154                 .map_err(Error::LoadInitrd)?;
1155                 Some((initrd_start, initrd_size))
1156             }
1157             None => None,
1158         };
1159 
1160         configure_system(
1161             mem,
1162             GuestAddress(KERNEL_START_OFFSET),
1163             GuestAddress(CMDLINE_OFFSET),
1164             cmdline.to_bytes().len() + 1,
1165             setup_data,
1166             initrd,
1167             params,
1168         )?;
1169         Ok(())
1170     }
1171 
get_pcie_vcfg_mmio_base(mem: &GuestMemory) -> u641172     fn get_pcie_vcfg_mmio_base(mem: &GuestMemory) -> u64 {
1173         // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is greater.
1174         let ram_end_round_2mb = (mem.end_addr().offset() + 2 * MB - 1) / (2 * MB) * (2 * MB);
1175         std::cmp::max(ram_end_round_2mb, 4 * GB)
1176     }
1177 
1178     /// This returns the start address of high mmio
1179     ///
1180     /// # Arguments
1181     ///
1182     /// * mem: The memory to be used by the guest
get_high_mmio_base(mem: &GuestMemory) -> u641183     fn get_high_mmio_base(mem: &GuestMemory) -> u64 {
1184         Self::get_pcie_vcfg_mmio_base(mem) + PCIE_VCFG_MMIO_SIZE
1185     }
1186 
1187     /// This returns the size of high mmio
1188     ///
1189     /// # Arguments
1190     ///
1191     /// * `vm`: The virtual machine
get_high_mmio_size<V: Vm>(vm: &V) -> u641192     fn get_high_mmio_size<V: Vm>(vm: &V) -> u64 {
1193         let phys_mem_end = 1u64 << vm.get_guest_phys_addr_bits();
1194         let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
1195         high_mmio_end - Self::get_high_mmio_base(vm.get_memory())
1196     }
1197 
1198     /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline() -> kernel_cmdline::Cmdline1199     fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
1200         let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
1201         cmdline.insert_str("panic=-1").unwrap();
1202 
1203         cmdline
1204     }
1205 
1206     /// Sets up the legacy x86 IO platform devices
1207     ///
1208     /// # Arguments
1209     ///
1210     /// * - `io_bus` - the IO bus object
1211     /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
1212     /// * - `reset_evt` - the event object which should receive exit events
1213     /// * - `mem_size` - the size in bytes of physical ram for the guest
setup_legacy_devices( io_bus: &devices::Bus, pit_uses_speaker_port: bool, reset_evt: Event, mem_size: u64, ) -> Result<()>1214     fn setup_legacy_devices(
1215         io_bus: &devices::Bus,
1216         pit_uses_speaker_port: bool,
1217         reset_evt: Event,
1218         mem_size: u64,
1219     ) -> Result<()> {
1220         struct NoDevice;
1221         impl devices::BusDevice for NoDevice {
1222             fn debug_label(&self) -> String {
1223                 "no device".to_owned()
1224             }
1225         }
1226 
1227         let mem_regions = arch_memory_regions(mem_size, None);
1228 
1229         let mem_below_4g = mem_regions
1230             .iter()
1231             .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
1232             .map(|r| r.1)
1233             .sum();
1234 
1235         let mem_above_4g = mem_regions
1236             .iter()
1237             .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
1238             .map(|r| r.1)
1239             .sum();
1240 
1241         io_bus
1242             .insert(
1243                 Arc::new(Mutex::new(devices::Cmos::new(mem_below_4g, mem_above_4g))),
1244                 0x70,
1245                 0x2,
1246             )
1247             .unwrap();
1248 
1249         let nul_device = Arc::new(Mutex::new(NoDevice));
1250         let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
1251             reset_evt.try_clone().map_err(Error::CloneEvent)?,
1252         )));
1253 
1254         if pit_uses_speaker_port {
1255             io_bus.insert(i8042, 0x062, 0x3).unwrap();
1256         } else {
1257             io_bus.insert(i8042, 0x061, 0x4).unwrap();
1258         }
1259 
1260         io_bus.insert(nul_device.clone(), 0x0ed, 0x1).unwrap(); // most likely this one does nothing
1261         io_bus.insert(nul_device, 0x0f0, 0x2).unwrap(); // ignore fpu
1262 
1263         Ok(())
1264     }
1265 
1266     /// Sets up the acpi devices for this platform and
1267     /// return the resources which is used to set the ACPI tables.
1268     ///
1269     /// # Arguments
1270     ///
1271     /// * - `io_bus` the I/O bus to add the devices to
1272     /// * - `resources` the SystemAllocator to allocate IO and MMIO for acpi
1273     ///                devices.
1274     /// * - `suspend_evt` the event object which used to suspend the vm
1275     /// * - `sdts` ACPI system description tables
1276     /// * - `irq_chip` the IrqChip object for registering irq events
1277     /// * - `battery` indicate whether to create the battery
1278     /// * - `mmio_bus` the MMIO bus to add the devices to
setup_acpi_devices( mem: &GuestMemory, io_bus: &devices::Bus, resources: &mut SystemAllocator, suspend_evt: Event, exit_evt: Event, sdts: Vec<SDT>, #[cfg(feature = "direct")] direct_gpe: &[u32], irq_chip: &mut dyn IrqChip, sci_irq: u32, battery: (&Option<BatteryType>, Option<Minijail>), mmio_bus: &devices::Bus, max_bus: u8, resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>, ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)>1279     fn setup_acpi_devices(
1280         mem: &GuestMemory,
1281         io_bus: &devices::Bus,
1282         resources: &mut SystemAllocator,
1283         suspend_evt: Event,
1284         exit_evt: Event,
1285         sdts: Vec<SDT>,
1286         #[cfg(feature = "direct")] direct_gpe: &[u32],
1287         irq_chip: &mut dyn IrqChip,
1288         sci_irq: u32,
1289         battery: (&Option<BatteryType>, Option<Minijail>),
1290         mmio_bus: &devices::Bus,
1291         max_bus: u8,
1292         resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
1293     ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
1294         // The AML data for the acpi devices
1295         let mut amls = Vec::new();
1296 
1297         let bat_control = if let Some(battery_type) = battery.0 {
1298             match battery_type {
1299                 BatteryType::Goldfish => {
1300                     let control_tube = arch::add_goldfish_battery(
1301                         &mut amls, battery.1, mmio_bus, irq_chip, sci_irq, resources,
1302                     )
1303                     .map_err(Error::CreateBatDevices)?;
1304                     Some(BatControl {
1305                         type_: BatteryType::Goldfish,
1306                         control_tube,
1307                     })
1308                 }
1309             }
1310         } else {
1311             None
1312         };
1313 
1314         let pm_alloc = resources.get_anon_alloc();
1315         let pm_iobase = match resources.io_allocator() {
1316             Some(io) => io
1317                 .allocate_with_align(
1318                     devices::acpi::ACPIPM_RESOURCE_LEN as u64,
1319                     pm_alloc,
1320                     "ACPIPM".to_string(),
1321                     4, // must be 32-bit aligned
1322                 )
1323                 .map_err(Error::AllocateIOResouce)?,
1324             None => 0x600,
1325         };
1326 
1327         let pcie_vcfg = aml::Name::new("VCFG".into(), &Self::get_pcie_vcfg_mmio_base(mem));
1328         pcie_vcfg.to_aml_bytes(&mut amls);
1329 
1330         let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
1331         irq_chip
1332             .register_level_irq_event(sci_irq, &pm_sci_evt)
1333             .map_err(Error::RegisterIrqfd)?;
1334 
1335         #[cfg(feature = "direct")]
1336         let direct_gpe_info = if direct_gpe.is_empty() {
1337             None
1338         } else {
1339             let direct_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
1340             let mut sci_devirq =
1341                 devices::DirectIrq::new_level(&direct_sci_evt).map_err(Error::CreateGpe)?;
1342 
1343             sci_devirq.sci_irq_prepare().map_err(Error::CreateGpe)?;
1344 
1345             for gpe in direct_gpe {
1346                 sci_devirq
1347                     .gpe_enable_forwarding(*gpe)
1348                     .map_err(Error::CreateGpe)?;
1349             }
1350 
1351             Some((direct_sci_evt, direct_gpe))
1352         };
1353 
1354         let mut pmresource = devices::ACPIPMResource::new(
1355             pm_sci_evt,
1356             #[cfg(feature = "direct")]
1357             direct_gpe_info,
1358             suspend_evt,
1359             exit_evt,
1360         );
1361         pmresource.to_aml_bytes(&mut amls);
1362         pmresource.start();
1363 
1364         let mut crs_entries: Vec<Box<dyn Aml>> = vec![
1365             Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
1366             Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
1367         ];
1368         for r in resources.mmio_pools() {
1369             let entry: Box<dyn Aml> = match (u32::try_from(*r.start()), u32::try_from(*r.end())) {
1370                 (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
1371                     aml::AddressSpaceCachable::NotCacheable,
1372                     true,
1373                     start,
1374                     end,
1375                 )),
1376                 _ => Box::new(aml::AddressSpace::new_memory(
1377                     aml::AddressSpaceCachable::NotCacheable,
1378                     true,
1379                     *r.start(),
1380                     *r.end(),
1381                 )),
1382             };
1383             crs_entries.push(entry);
1384         }
1385 
1386         let mut pci_dsdt_inner_data: Vec<&dyn aml::Aml> = Vec::new();
1387         let hid = aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08"));
1388         pci_dsdt_inner_data.push(&hid);
1389         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03"));
1390         pci_dsdt_inner_data.push(&cid);
1391         let adr = aml::Name::new("_ADR".into(), &aml::ZERO);
1392         pci_dsdt_inner_data.push(&adr);
1393         let seg = aml::Name::new("_SEG".into(), &aml::ZERO);
1394         pci_dsdt_inner_data.push(&seg);
1395         let uid = aml::Name::new("_UID".into(), &aml::ZERO);
1396         pci_dsdt_inner_data.push(&uid);
1397         let supp = aml::Name::new("SUPP".into(), &aml::ZERO);
1398         pci_dsdt_inner_data.push(&supp);
1399         let crs = aml::Name::new(
1400             "_CRS".into(),
1401             &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
1402         );
1403         pci_dsdt_inner_data.push(&crs);
1404 
1405         let pci_root_osc = PciRootOSC {};
1406         pci_dsdt_inner_data.push(&pci_root_osc);
1407 
1408         aml::Device::new("_SB_.PCI0".into(), pci_dsdt_inner_data).to_aml_bytes(&mut amls);
1409 
1410         let pm = Arc::new(Mutex::new(pmresource));
1411         io_bus
1412             .insert(
1413                 pm.clone(),
1414                 pm_iobase as u64,
1415                 devices::acpi::ACPIPM_RESOURCE_LEN as u64,
1416             )
1417             .unwrap();
1418         resume_notify_devices.push(pm.clone());
1419 
1420         Ok((
1421             acpi::AcpiDevResource {
1422                 amls,
1423                 pm_iobase,
1424                 pm,
1425                 sdts,
1426             },
1427             bat_control,
1428         ))
1429     }
1430 
1431     /// Sets up the serial devices for this platform. Returns the serial port number and serial
1432     /// device to be used for stdout
1433     ///
1434     /// # Arguments
1435     ///
1436     /// * - `irq_chip` the IrqChip object for registering irq events
1437     /// * - `io_bus` the I/O bus to add the devices to
1438     /// * - `serial_parmaters` - definitions for how the serial devices should be configured
setup_serial_devices( protected_vm: ProtectionType, irq_chip: &mut dyn IrqChip, io_bus: &devices::Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, ) -> Result<()>1439     fn setup_serial_devices(
1440         protected_vm: ProtectionType,
1441         irq_chip: &mut dyn IrqChip,
1442         io_bus: &devices::Bus,
1443         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
1444         serial_jail: Option<Minijail>,
1445     ) -> Result<()> {
1446         let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
1447         let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
1448 
1449         arch::add_serial_devices(
1450             protected_vm,
1451             io_bus,
1452             com_evt_1_3.get_trigger(),
1453             com_evt_2_4.get_trigger(),
1454             serial_parameters,
1455             serial_jail,
1456         )
1457         .map_err(Error::CreateSerialDevices)?;
1458 
1459         irq_chip
1460             .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3)
1461             .map_err(Error::RegisterIrqfd)?;
1462         irq_chip
1463             .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4)
1464             .map_err(Error::RegisterIrqfd)?;
1465 
1466         Ok(())
1467     }
1468 }
1469 
1470 #[cfg(test)]
1471 mod test_integration;
1472 
#[cfg(test)]
mod tests {
    // Unit tests for the guest memory layout returned by `arch_memory_regions` and for the
    // constants that describe the memory gap below 4 GB.
    //
    // Each entry returned by `arch_memory_regions` is used here as a `(start, size)` pair:
    // `.0` is compared against a `GuestAddress` and `.1` against a byte count.

    use super::*;

    /// 512 MB of RAM and no BIOS: one region, starting at `START_OF_RAM_32BITS`, covering the
    /// whole requested size.
    #[test]
    fn regions_lt_4gb_nobios() {
        let regions = arch_memory_regions(512 * MB, /* bios_size */ None);
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        // 1 << 29 bytes == 512 MiB.
        assert_eq!(1u64 << 29, regions[0].1);
    }

    /// Slightly more than 4 GB of RAM and no BIOS: two regions, the second one starting at
    /// 4 GB, whose sizes add up to the requested size.
    #[test]
    fn regions_gt_4gb_nobios() {
        let size = 4 * GB + 0x8000;
        let regions = arch_memory_regions(size, /* bios_size */ None);
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(GuestAddress(4 * GB), regions[1].0);
        assert_eq!(size, regions[0].1 + regions[1].1);
    }

    /// 512 MB of RAM plus a 1 MB BIOS: the RAM region is followed by a BIOS region that ends
    /// at `FIRST_ADDR_PAST_32BITS`.
    #[test]
    fn regions_lt_4gb_bios() {
        let bios_len = MB;
        let regions = arch_memory_regions(512 * MB, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(512 * MB, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    /// Slightly more than 4 GB of RAM plus a 1 MB BIOS: three regions, with the BIOS region in
    /// the middle (ending at `FIRST_ADDR_PAST_32BITS`) and the last region starting at 4 GB.
    #[test]
    fn regions_gt_4gb_bios() {
        let bios_len = MB;
        let regions = arch_memory_regions(4 * GB + 0x8000, Some(bios_len));
        assert_eq!(3, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
        assert_eq!(GuestAddress(4 * GB), regions[2].0);
    }

    /// Boundary case without a BIOS: the requested size is exactly 4 GB minus the overhead
    /// (`MEM_32BIT_GAP_SIZE` and `START_OF_RAM_32BITS`), which must still produce one region.
    #[test]
    fn regions_eq_4gb_nobios() {
        // Test with exact size of 4GB - the overhead.
        let size = 4 * GB - MEM_32BIT_GAP_SIZE - START_OF_RAM_32BITS;
        let regions = arch_memory_regions(size, /* bios_size */ None);
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(size, regions[0].1);
    }

    /// Boundary case with a 1 MB BIOS: same requested size as `regions_eq_4gb_nobios`, which
    /// must produce the single RAM region plus the BIOS region.
    #[test]
    fn regions_eq_4gb_bios() {
        // Test with exact size of 4GB - the overhead.
        let bios_len = MB;
        let size = 4 * GB - MEM_32BIT_GAP_SIZE - START_OF_RAM_32BITS;
        let regions = arch_memory_regions(size, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(size, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    /// Only built with the "direct" feature: `END_ADDR_BEFORE_32BITS` must stay below the
    /// lowest type16 region start address recorded in the comments below.
    #[test]
    #[cfg(feature = "direct")]
    fn end_addr_before_32bits() {
        // On volteer, type16 (coreboot) region is at 0x00000000769f3000-0x0000000076ffffff.
        // On brya, type16 region is at 0x0000000076876000-0x00000000803fffff
        let brya_type16_address = 0x7687_6000;
        assert!(
            END_ADDR_BEFORE_32BITS < brya_type16_address,
            "{} < {}",
            END_ADDR_BEFORE_32BITS,
            brya_type16_address
        );
    }

    /// `MEM_32BIT_GAP_SIZE` must be a whole multiple of 256 MB.
    #[test]
    fn check_32bit_gap_size_alignment() {
        // 32bit gap memory is 256 MB aligned to be friendly for MTRR mappings.
        assert_eq!(MEM_32BIT_GAP_SIZE % (256 * MB), 0);
    }
}
1580