1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6
7 mod fdt;
8
9 const SETUP_DTB: u32 = 2;
10 const X86_64_FDT_MAX_SIZE: u64 = 0x20_0000;
11
12 #[allow(dead_code)]
13 #[allow(non_upper_case_globals)]
14 #[allow(non_camel_case_types)]
15 #[allow(non_snake_case)]
16 mod bootparam;
17
18 // boot_params is just a series of integers, so it is safe to initialize it from raw bytes.
19 unsafe impl data_model::DataInit for bootparam::boot_params {}
20
21 #[allow(dead_code)]
22 #[allow(non_upper_case_globals)]
23 mod msr_index;
24
25 #[allow(dead_code)]
26 #[allow(non_upper_case_globals)]
27 #[allow(non_camel_case_types)]
28 #[allow(clippy::all)]
29 mod mpspec;
30 // These mpspec types are plain data, so reading them from raw bytes is a safe initialization.
31 unsafe impl data_model::DataInit for mpspec::mpc_bus {}
32 unsafe impl data_model::DataInit for mpspec::mpc_cpu {}
33 unsafe impl data_model::DataInit for mpspec::mpc_intsrc {}
34 unsafe impl data_model::DataInit for mpspec::mpc_ioapic {}
35 unsafe impl data_model::DataInit for mpspec::mpc_table {}
36 unsafe impl data_model::DataInit for mpspec::mpc_lintsrc {}
37 unsafe impl data_model::DataInit for mpspec::mpf_intel {}
38
39 mod acpi;
40 mod bzimage;
41 mod cpuid;
42 mod gdt;
43 mod interrupts;
44 mod mptable;
45 mod regs;
46 mod smbios;
47
48 use std::collections::BTreeMap;
49 use std::convert::TryFrom;
50 use std::ffi::{CStr, CString};
51 use std::fs::File;
52 use std::io::{self, Seek};
53 use std::mem;
54 use std::sync::Arc;
55
56 use crate::bootparam::boot_params;
57 use acpi_tables::sdt::SDT;
58 use acpi_tables::{aml, aml::Aml};
59 use arch::{get_serial_cmdline, GetSerialCmdlineError, RunnableLinuxVm, VmComponents, VmImage};
60 use base::{warn, Event};
61 use devices::serial_device::{SerialHardware, SerialParameters};
62 use devices::{
63 BusDeviceObj, BusResumeDevice, IrqChip, IrqChipX86_64, PciAddress, PciConfigIo, PciConfigMmio,
64 PciDevice, PciVirtualConfigMmio,
65 };
66 use hypervisor::{HypervisorX86_64, ProtectionType, VcpuX86_64, Vm, VmX86_64};
67 use minijail::Minijail;
68 use remain::sorted;
69 use resources::{MemRegion, SystemAllocator, SystemAllocatorConfig};
70 use sync::Mutex;
71 use thiserror::Error;
72 use vm_control::{BatControl, BatteryType};
73 use vm_memory::{GuestAddress, GuestMemory, GuestMemoryError};
74 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
75 use {
76 gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs},
77 hypervisor::x86_64::{Regs, Sregs},
78 };
79
80 #[sorted]
81 #[derive(Error, Debug)]
82 pub enum Error {
83 #[error("error allocating IO resource: {0}")]
84 AllocateIOResouce(resources::Error),
85 #[error("error allocating a single irq")]
86 AllocateIrq,
87 #[error("unable to clone an Event: {0}")]
88 CloneEvent(base::Error),
89 #[error("failed to clone IRQ chip: {0}")]
90 CloneIrqChip(base::Error),
91 #[error("the given kernel command line was invalid: {0}")]
92 Cmdline(kernel_cmdline::Error),
93 #[error("failed to configure hotplugged pci device: {0}")]
94 ConfigurePciDevice(arch::DeviceRegistrationError),
95 #[error("error configuring the system")]
96 ConfigureSystem,
97 #[error("unable to create ACPI tables")]
98 CreateAcpi,
99 #[error("unable to create battery devices: {0}")]
100 CreateBatDevices(arch::DeviceRegistrationError),
101 #[error("unable to make an Event: {0}")]
102 CreateEvent(base::Error),
103 #[error("failed to create fdt: {0}")]
104 CreateFdt(arch::fdt::Error),
105 #[cfg(feature = "direct")]
106 #[error("failed to enable GPE forwarding: {0}")]
107 CreateGpe(devices::DirectIrqError),
108 #[error("failed to create IOAPIC device: {0}")]
109 CreateIoapicDevice(base::Error),
110 #[error("failed to create a PCI root hub: {0}")]
111 CreatePciRoot(arch::DeviceRegistrationError),
112 #[error("unable to create PIT: {0}")]
113 CreatePit(base::Error),
114 #[error("unable to make PIT device: {0}")]
115 CreatePitDevice(devices::PitError),
116 #[error("unable to create serial devices: {0}")]
117 CreateSerialDevices(arch::DeviceRegistrationError),
118 #[error("failed to create socket: {0}")]
119 CreateSocket(io::Error),
120 #[error("failed to create VCPU: {0}")]
121 CreateVcpu(base::Error),
122 #[error("invalid e820 setup params")]
123 E820Configuration,
124 #[error("failed to enable singlestep execution: {0}")]
125 EnableSinglestep(base::Error),
126 #[error("failed to enable split irqchip: {0}")]
127 EnableSplitIrqchip(base::Error),
128 #[error("failed to get serial cmdline: {0}")]
129 GetSerialCmdline(GetSerialCmdlineError),
130 #[error("the kernel extends past the end of RAM")]
131 KernelOffsetPastEnd,
132 #[error("error loading bios: {0}")]
133 LoadBios(io::Error),
134 #[error("error loading kernel bzImage: {0}")]
135 LoadBzImage(bzimage::Error),
136 #[error("error loading command line: {0}")]
137 LoadCmdline(kernel_loader::Error),
138 #[error("error loading initrd: {0}")]
139 LoadInitrd(arch::LoadImageError),
140 #[error("error loading Kernel: {0}")]
141 LoadKernel(kernel_loader::Error),
142 #[error("error translating address: Page not present")]
143 PageNotPresent,
144 #[error("error reading guest memory {0}")]
145 ReadingGuestMemory(vm_memory::GuestMemoryError),
146 #[error("error reading CPU registers {0}")]
147 ReadRegs(base::Error),
148 #[error("error registering an IrqFd: {0}")]
149 RegisterIrqfd(base::Error),
150 #[error("error registering virtual socket device: {0}")]
151 RegisterVsock(arch::DeviceRegistrationError),
152 #[error("failed to set a hardware breakpoint: {0}")]
153 SetHwBreakpoint(base::Error),
154 #[error("failed to set interrupts: {0}")]
155 SetLint(interrupts::Error),
156 #[error("failed to set tss addr: {0}")]
157 SetTssAddr(base::Error),
158 #[error("failed to set up cpuid: {0}")]
159 SetupCpuid(cpuid::Error),
160 #[error("failed to set up FPU: {0}")]
161 SetupFpu(regs::Error),
162 #[error("failed to set up guest memory: {0}")]
163 SetupGuestMemory(GuestMemoryError),
164 #[error("failed to set up mptable: {0}")]
165 SetupMptable(mptable::Error),
166 #[error("failed to set up MSRs: {0}")]
167 SetupMsrs(regs::Error),
168 #[error("failed to set up registers: {0}")]
169 SetupRegs(regs::Error),
170 #[error("failed to set up SMBIOS: {0}")]
171 SetupSmbios(smbios::Error),
172 #[error("failed to set up sregs: {0}")]
173 SetupSregs(regs::Error),
174 #[error("failed to translate virtual address")]
175 TranslatingVirtAddr,
176 #[error("protected VMs not supported on x86_64")]
177 UnsupportedProtectionType,
178 #[error("error writing CPU registers {0}")]
179 WriteRegs(base::Error),
180 #[error("error writing guest memory {0}")]
181 WritingGuestMemory(GuestMemoryError),
182 #[error("the zero page extends past the end of guest_mem")]
183 ZeroPagePastRamEnd,
184 #[error("error writing the zero page of guest memory")]
185 ZeroPageSetup,
186 }
187
188 pub type Result<T> = std::result::Result<T, Error>;
189
190 pub struct X8664arch;
191
192 enum E820Type {
193 Ram = 0x01,
194 Reserved = 0x2,
195 }
196
197 const MB: u64 = 1 << 20;
198 const GB: u64 = 1 << 30;
199
200 const BOOT_STACK_POINTER: u64 = 0x8000;
201 // Make sure it is aligned to 256 MB for MTRR convenience.
202 const MEM_32BIT_GAP_SIZE: u64 = if cfg!(feature = "direct") {
203 // Allow space for identity mapping coreboot memory regions on the host,
204 // which are found at around 0x7a00_0000 (a little below 2 GB).
205 //
206 // TODO(b/188011323): stop hardcoding sizes and addresses here and instead
207 // determine the memory map from how the VM has been configured via the
208 // command line.
209 2560 * MB
210 } else {
211 768 * MB
212 };
213 const START_OF_RAM_32BITS: u64 = if cfg!(feature = "direct") { 0x1000 } else { 0 };
214 const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
215 // Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/.....
216 const RESERVED_MEM_SIZE: u64 = 0x800_0000;
217 // Reserve 64MB for pcie enhanced configuration
218 const PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000;
219 const PCIE_CFG_MMIO_START: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - PCIE_CFG_MMIO_SIZE;
220 // Reserve memory region for pcie virtual configuration
221 const PCIE_VCFG_MMIO_SIZE: u64 = PCIE_CFG_MMIO_SIZE;
222 const END_ADDR_BEFORE_32BITS: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
223 const PCI_MMIO_SIZE: u64 = MEM_32BIT_GAP_SIZE - RESERVED_MEM_SIZE - PCIE_CFG_MMIO_SIZE;
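// For reference (a worked example, not used by the code), with the default non-"direct"
// 768 MB gap the constants above work out to:
//   END_ADDR_BEFORE_32BITS = 0x1_0000_0000 - 0x3000_0000 = 0xD000_0000 (low MMIO base)
//   PCI_MMIO_SIZE          = 0x3000_0000 - 0x0800_0000 - 0x0400_0000 = 0x2400_0000
//   PCIE_CFG_MMIO_START    = 0x1_0000_0000 - 0x0800_0000 - 0x0400_0000 = 0xF400_0000
// so low PCI MMIO occupies [0xD000_0000, 0xF400_0000), followed by the PCIe config
// window and the reserved region up to 4 GB.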
224 // Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
225 const HIGH_MMIO_MAX_END: u64 = 1u64 << 46;
226 const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
227 const ZERO_PAGE_OFFSET: u64 = 0x7000;
228 const TSS_ADDR: u64 = 0xfffb_d000;
229
230 const KERNEL_START_OFFSET: u64 = 0x20_0000;
231 const CMDLINE_OFFSET: u64 = 0x2_0000;
232 const CMDLINE_MAX_SIZE: u64 = KERNEL_START_OFFSET - CMDLINE_OFFSET;
233 const X86_64_SERIAL_1_3_IRQ: u32 = 4;
234 const X86_64_SERIAL_2_4_IRQ: u32 = 3;
235 // X86_64_SCI_IRQ is used to fill the ACPI FACP table.
236 // The SCI IRQ should be a legacy IRQ number, i.e. less than 16
237 // (in fact most platforms use the fixed IRQ number 9).
238 // We reserve IRQ number 5 for SCI and let the
239 // other devices start allocating from the next
240 // free IRQ number.
241 pub const X86_64_SCI_IRQ: u32 = 5;
242 // The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
243 pub const X86_64_IRQ_BASE: u32 = 9;
244 const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;
245
246 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
247 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
248 /// pointer at the effective physical address 0xFFFF_FFF0.
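/// For example, a 1 MB BIOS image is therefore loaded at 0xFFF0_0000 so that the reset
/// vector at 0xFFFF_FFF0 falls within its last 16 bytes.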
249 fn bios_start(bios_size: u64) -> GuestAddress {
250 GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
251 }
252
253 fn configure_system(
254 guest_mem: &GuestMemory,
255 kernel_addr: GuestAddress,
256 cmdline_addr: GuestAddress,
257 cmdline_size: usize,
258 setup_data: Option<GuestAddress>,
259 initrd: Option<(GuestAddress, usize)>,
260 mut params: boot_params,
261 ) -> Result<()> {
262 const EBDA_START: u64 = 0x0009_fc00;
263 const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
264 const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
265 const KERNEL_LOADER_OTHER: u8 = 0xff;
266 const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.
267 let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
268 let end_32bit_gap_start = GuestAddress(END_ADDR_BEFORE_32BITS);
269
270 params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
271 params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
272 params.hdr.header = KERNEL_HDR_MAGIC;
273 params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
274 params.hdr.cmdline_size = cmdline_size as u32;
275 params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
276 if let Some(setup_data) = setup_data {
277 params.hdr.setup_data = setup_data.offset();
278 }
279 if let Some((initrd_addr, initrd_size)) = initrd {
280 params.hdr.ramdisk_image = initrd_addr.offset() as u32;
281 params.hdr.ramdisk_size = initrd_size as u32;
282 }
283
284 add_e820_entry(
285 &mut params,
286 START_OF_RAM_32BITS,
287 EBDA_START - START_OF_RAM_32BITS,
288 E820Type::Ram,
289 )?;
290
291 let mem_end = guest_mem.end_addr();
292 if mem_end < end_32bit_gap_start {
293 add_e820_entry(
294 &mut params,
295 kernel_addr.offset() as u64,
296 mem_end.offset_from(kernel_addr) as u64,
297 E820Type::Ram,
298 )?;
299 } else {
300 add_e820_entry(
301 &mut params,
302 kernel_addr.offset() as u64,
303 end_32bit_gap_start.offset_from(kernel_addr) as u64,
304 E820Type::Ram,
305 )?;
306 if mem_end > first_addr_past_32bits {
307 add_e820_entry(
308 &mut params,
309 first_addr_past_32bits.offset() as u64,
310 mem_end.offset_from(first_addr_past_32bits) as u64,
311 E820Type::Ram,
312 )?;
313 }
314 }
315
316 add_e820_entry(
317 &mut params,
318 PCIE_CFG_MMIO_START,
319 PCIE_CFG_MMIO_SIZE,
320 E820Type::Reserved,
321 )?;
322
323 add_e820_entry(
324 &mut params,
325 X8664arch::get_pcie_vcfg_mmio_base(guest_mem),
326 PCIE_VCFG_MMIO_SIZE,
327 E820Type::Reserved,
328 )?;
329
330 let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
331 guest_mem
332 .checked_offset(zero_page_addr, mem::size_of::<boot_params>() as u64)
333 .ok_or(Error::ZeroPagePastRamEnd)?;
334 guest_mem
335 .write_obj_at_addr(params, zero_page_addr)
336 .map_err(|_| Error::ZeroPageSetup)?;
337
338 Ok(())
339 }
340
341 /// Add an e820 region to the e820 map.
342 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
343 fn add_e820_entry(
344 params: &mut boot_params,
345 addr: u64,
346 size: u64,
347 mem_type: E820Type,
348 ) -> Result<()> {
349 if params.e820_entries >= params.e820_table.len() as u8 {
350 return Err(Error::E820Configuration);
351 }
352
353 params.e820_table[params.e820_entries as usize].addr = addr;
354 params.e820_table[params.e820_entries as usize].size = size;
355 params.e820_table[params.e820_entries as usize].type_ = mem_type as u32;
356 params.e820_entries += 1;
357
358 Ok(())
359 }
360
361 /// Returns a Vec of the valid memory addresses.
362 /// These should be used to configure the GuestMemory structure for the platform.
363 /// For x86_64 all addresses are valid from the start of the kernel except a
364 /// carve-out at the end of the 32-bit address space.
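/// For example (assuming the default 768 MB gap and no "direct" feature), a 4 GB guest
/// without a BIOS gets RAM at [0, 0xD000_0000) and [0x1_0000_0000, 0x1_3000_0000).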
365 fn arch_memory_regions(size: u64, bios_size: Option<u64>) -> Vec<(GuestAddress, u64)> {
366 let mem_start = START_OF_RAM_32BITS;
367 let mem_end = GuestAddress(size + mem_start);
368 let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
369 let end_32bit_gap_start = GuestAddress(END_ADDR_BEFORE_32BITS);
370 let mut regions = Vec::new();
371 if mem_end <= end_32bit_gap_start {
372 regions.push((GuestAddress(mem_start), size));
373 if let Some(bios_size) = bios_size {
374 regions.push((bios_start(bios_size), bios_size));
375 }
376 } else {
377 regions.push((
378 GuestAddress(mem_start),
379 end_32bit_gap_start.offset() - mem_start,
380 ));
381 if let Some(bios_size) = bios_size {
382 regions.push((bios_start(bios_size), bios_size));
383 }
384 regions.push((
385 first_addr_past_32bits,
386 mem_end.offset_from(end_32bit_gap_start),
387 ));
388 }
389
390 regions
391 }
392
393 impl arch::LinuxArch for X8664arch {
394 type Error = Error;
395
396 fn guest_memory_layout(
397 components: &VmComponents,
398 ) -> std::result::Result<Vec<(GuestAddress, u64)>, Self::Error> {
399 let bios_size = match &components.vm_image {
400 VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
401 VmImage::Kernel(_) => None,
402 };
403 Ok(arch_memory_regions(components.memory_size, bios_size))
404 }
405
406 fn get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig {
407 let guest_mem = vm.get_memory();
408 let high_mmio_start = Self::get_high_mmio_base(guest_mem);
409 let high_mmio_size = Self::get_high_mmio_size(vm);
410 SystemAllocatorConfig {
411 io: Some(MemRegion {
412 base: 0xc000,
413 size: 0x4000,
414 }),
415 low_mmio: MemRegion {
416 base: END_ADDR_BEFORE_32BITS,
417 size: PCI_MMIO_SIZE,
418 },
419 high_mmio: MemRegion {
420 base: high_mmio_start,
421 size: high_mmio_size,
422 },
423 platform_mmio: None,
424 first_irq: X86_64_IRQ_BASE,
425 }
426 }
427
428 fn build_vm<V, Vcpu>(
429 mut components: VmComponents,
430 exit_evt: &Event,
431 reset_evt: &Event,
432 system_allocator: &mut SystemAllocator,
433 serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
434 serial_jail: Option<Minijail>,
435 battery: (&Option<BatteryType>, Option<Minijail>),
436 mut vm: V,
437 ramoops_region: Option<arch::pstore::RamoopsRegion>,
438 devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
439 irq_chip: &mut dyn IrqChipX86_64,
440 kvm_vcpu_ids: &mut Vec<usize>,
441 ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
442 where
443 V: VmX86_64,
444 Vcpu: VcpuX86_64,
445 {
446 if components.protected_vm != ProtectionType::Unprotected {
447 return Err(Error::UnsupportedProtectionType);
448 }
449
450 let mem = vm.get_memory().clone();
451
452 let vcpu_count = components.vcpu_count;
453
454 let tss_addr = GuestAddress(TSS_ADDR);
455 vm.set_tss_addr(tss_addr).map_err(Error::SetTssAddr)?;
456
457 // Use IRQ info in ACPI if provided by the user.
458 let mut noirq = true;
459 let mut mptable = true;
460 let mut sci_irq = X86_64_SCI_IRQ;
461
462 for sdt in components.acpi_sdts.iter() {
463 if sdt.is_signature(b"DSDT") || sdt.is_signature(b"APIC") {
464 noirq = false;
465 } else if sdt.is_signature(b"FACP") {
466 mptable = false;
467 let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
468 sci_irq = sci_irq_fadt.into();
469 if !system_allocator.reserve_irq(sci_irq) {
470 warn!("sci irq {} already reserved.", sci_irq);
471 }
472 }
473 }
474
475 let mmio_bus = Arc::new(devices::Bus::new());
476 let io_bus = Arc::new(devices::Bus::new());
477
478 let (pci_devices, _others): (Vec<_>, Vec<_>) = devs
479 .into_iter()
480 .partition(|(dev, _)| dev.as_pci_device().is_some());
481
482 let pci_devices = pci_devices
483 .into_iter()
484 .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
485 .collect();
486
487 let (pci, pci_irqs, pid_debug_label_map) = arch::generate_pci_root(
488 pci_devices,
489 irq_chip.as_irq_chip_mut(),
490 mmio_bus.clone(),
491 io_bus.clone(),
492 system_allocator,
493 &mut vm,
494 4, // Share the four pin interrupts (INTx#)
495 )
496 .map_err(Error::CreatePciRoot)?;
497
498 let pci = Arc::new(Mutex::new(pci));
499 pci.lock().enable_pcie_cfg_mmio(PCIE_CFG_MMIO_START);
500 let pci_cfg = PciConfigIo::new(
501 pci.clone(),
502 reset_evt.try_clone().map_err(Error::CloneEvent)?,
503 );
504 let pci_bus = Arc::new(Mutex::new(pci_cfg));
505 io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();
506
507 let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
508 mmio_bus
509 .insert(pcie_cfg_mmio, PCIE_CFG_MMIO_START, PCIE_CFG_MMIO_SIZE)
510 .unwrap();
511
512 let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 12)));
513 mmio_bus
514 .insert(
515 pcie_vcfg_mmio,
516 Self::get_pcie_vcfg_mmio_base(&mem),
517 PCIE_VCFG_MMIO_SIZE,
518 )
519 .unwrap();
520
521 // Event used to notify crosvm that guest OS is trying to suspend.
522 let suspend_evt = Event::new().map_err(Error::CreateEvent)?;
523
524 if !components.no_legacy {
525 Self::setup_legacy_devices(
526 &io_bus,
527 irq_chip.pit_uses_speaker_port(),
528 reset_evt.try_clone().map_err(Error::CloneEvent)?,
529 components.memory_size,
530 )?;
531 }
532 Self::setup_serial_devices(
533 components.protected_vm,
534 irq_chip.as_irq_chip_mut(),
535 &io_bus,
536 serial_parameters,
537 serial_jail,
538 )?;
539
540 let mut resume_notify_devices = Vec::new();
541
542 // Each bus occupies 1 MB of MMIO for PCIe enhanced configuration.
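// e.g. the 64 MB PCIE_CFG_MMIO_SIZE window covers buses 0 through 63.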
543 let max_bus = ((PCIE_CFG_MMIO_SIZE / 0x100000) - 1) as u8;
544
545 let (acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
546 &mem,
547 &io_bus,
548 system_allocator,
549 suspend_evt.try_clone().map_err(Error::CloneEvent)?,
550 exit_evt.try_clone().map_err(Error::CloneEvent)?,
551 components.acpi_sdts,
552 #[cfg(feature = "direct")]
553 &components.direct_gpe,
554 irq_chip.as_irq_chip_mut(),
555 sci_irq,
556 battery,
557 &mmio_bus,
558 max_bus,
559 &mut resume_notify_devices,
560 )?;
561
562 irq_chip
563 .finalize_devices(system_allocator, &io_bus, &mmio_bus)
564 .map_err(Error::RegisterIrqfd)?;
565
566 // All of these BIOS-generated tables are set up manually for the benefit of the kernel boot
567 // flow (since there's no BIOS to set them) and for the BIOS boot flow, since crosvm doesn't
568 // have a way to pass these configs to the BIOS.
569 // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
570 // tables and the guest OS picks them up.
571 // If another guest does need a way to pass these tables down to its BIOS, this approach
572 // should be rethought.
573
574 if mptable {
575 // Note that this puts the mptable at 0x9FC00 in guest physical memory.
576 mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
577 .map_err(Error::SetupMptable)?;
578 }
579 smbios::setup_smbios(&mem, components.dmi_path).map_err(Error::SetupSmbios)?;
580
581 let host_cpus = if components.host_cpu_topology {
582 components.vcpu_affinity.clone()
583 } else {
584 None
585 };
586
587 // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
588 acpi::create_acpi_tables(
589 &mem,
590 vcpu_count as u8,
591 sci_irq,
592 0xcf9,
593 6, // RST_CPU|SYS_RST
594 &acpi_dev_resource,
595 host_cpus,
596 kvm_vcpu_ids,
597 &pci_irqs,
598 PCIE_CFG_MMIO_START,
599 max_bus,
600 components.force_s2idle,
601 )
602 .ok_or(Error::CreateAcpi)?;
603
604 let mut cmdline = Self::get_base_linux_cmdline();
605
606 if noirq {
607 cmdline.insert_str("acpi=noirq").unwrap();
608 }
609
610 get_serial_cmdline(&mut cmdline, serial_parameters, "io")
611 .map_err(Error::GetSerialCmdline)?;
612
613 for param in components.extra_kernel_params {
614 cmdline.insert_str(&param).map_err(Error::Cmdline)?;
615 }
616
617 if let Some(ramoops_region) = ramoops_region {
618 arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
619 .map_err(Error::Cmdline)?;
620 }
621
622 match components.vm_image {
623 VmImage::Bios(ref mut bios) => {
624 // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
625 kernel_loader::load_cmdline(
626 &mem,
627 GuestAddress(CMDLINE_OFFSET),
628 &CString::new(cmdline).unwrap(),
629 )
630 .map_err(Error::LoadCmdline)?;
631 Self::load_bios(&mem, bios)?
632 }
633 VmImage::Kernel(ref mut kernel_image) => {
634 // separate out load_kernel from other setup to get a specific error for
635 // kernel loading
636 let (params, kernel_end) = Self::load_kernel(&mem, kernel_image)?;
637
638 Self::setup_system_memory(
639 &mem,
640 &CString::new(cmdline).unwrap(),
641 components.initrd_image,
642 components.android_fstab,
643 kernel_end,
644 params,
645 )?;
646 }
647 }
648
649 Ok(RunnableLinuxVm {
650 vm,
651 vcpu_count,
652 vcpus: None,
653 vcpu_affinity: components.vcpu_affinity,
654 no_smt: components.no_smt,
655 irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?,
656 has_bios: matches!(components.vm_image, VmImage::Bios(_)),
657 io_bus,
658 mmio_bus,
659 pid_debug_label_map,
660 suspend_evt,
661 resume_notify_devices,
662 rt_cpus: components.rt_cpus,
663 delay_rt: components.delay_rt,
664 bat_control,
665 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
666 gdb: components.gdb,
667 pm: Some(acpi_dev_resource.pm),
668 root_config: pci,
669 hotplug_bus: Vec::new(),
670 })
671 }
672
673 fn configure_vcpu<V: Vm>(
674 vm: &V,
675 hypervisor: &dyn HypervisorX86_64,
676 irq_chip: &mut dyn IrqChipX86_64,
677 vcpu: &mut dyn VcpuX86_64,
678 vcpu_id: usize,
679 num_cpus: usize,
680 has_bios: bool,
681 no_smt: bool,
682 host_cpu_topology: bool,
683 ) -> Result<()> {
684 cpuid::setup_cpuid(
685 hypervisor,
686 irq_chip,
687 vcpu,
688 vcpu_id,
689 num_cpus,
690 no_smt,
691 host_cpu_topology,
692 )
693 .map_err(Error::SetupCpuid)?;
694
695 if has_bios {
696 return Ok(());
697 }
698
699 let guest_mem = vm.get_memory();
700 let kernel_load_addr = GuestAddress(KERNEL_START_OFFSET);
701 regs::setup_msrs(vm, vcpu, END_ADDR_BEFORE_32BITS).map_err(Error::SetupMsrs)?;
702 let kernel_end = guest_mem
703 .checked_offset(kernel_load_addr, KERNEL_64BIT_ENTRY_OFFSET)
704 .ok_or(Error::KernelOffsetPastEnd)?;
705 regs::setup_regs(
706 vcpu,
707 (kernel_end).offset() as u64,
708 BOOT_STACK_POINTER as u64,
709 ZERO_PAGE_OFFSET as u64,
710 )
711 .map_err(Error::SetupRegs)?;
712 regs::setup_fpu(vcpu).map_err(Error::SetupFpu)?;
713 regs::setup_sregs(guest_mem, vcpu).map_err(Error::SetupSregs)?;
714 interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
715
716 Ok(())
717 }
718
719 fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>(
720 linux: &mut RunnableLinuxVm<V, Vcpu>,
721 device: Box<dyn PciDevice>,
722 minijail: Option<Minijail>,
723 resources: &mut SystemAllocator,
724 ) -> Result<PciAddress> {
725 let pci_address = arch::configure_pci_device(linux, device, minijail, resources)
726 .map_err(Error::ConfigurePciDevice)?;
727
728 Ok(pci_address)
729 }
730
731 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
732 fn debug_read_registers<T: VcpuX86_64>(vcpu: &T) -> Result<X86_64CoreRegs> {
733 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
734 let gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
735 let regs = [
736 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
737 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
738 ];
739
740 // GDB exposes 32-bit eflags instead of 64-bit rflags.
741 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
742 let eflags = gregs.rflags as u32;
743 let rip = gregs.rip;
744
745 // Segment registers: CS, SS, DS, ES, FS, GS
746 let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
747 let segments = X86SegmentRegs {
748 cs: sregs.cs.selector as u32,
749 ss: sregs.ss.selector as u32,
750 ds: sregs.ds.selector as u32,
751 es: sregs.es.selector as u32,
752 fs: sregs.fs.selector as u32,
753 gs: sregs.gs.selector as u32,
754 };
755
756 // TODO(keiichiw): Other registers such as FPU, xmm and mxcsr.
757
758 Ok(X86_64CoreRegs {
759 regs,
760 eflags,
761 rip,
762 segments,
763 ..Default::default()
764 })
765 }
766
767 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
768 fn debug_write_registers<T: VcpuX86_64>(vcpu: &T, regs: &X86_64CoreRegs) -> Result<()> {
769 // General purpose registers (RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15) + RIP + rflags
770 let orig_gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
771 let gregs = Regs {
772 rax: regs.regs[0],
773 rbx: regs.regs[1],
774 rcx: regs.regs[2],
775 rdx: regs.regs[3],
776 rsi: regs.regs[4],
777 rdi: regs.regs[5],
778 rbp: regs.regs[6],
779 rsp: regs.regs[7],
780 r8: regs.regs[8],
781 r9: regs.regs[9],
782 r10: regs.regs[10],
783 r11: regs.regs[11],
784 r12: regs.regs[12],
785 r13: regs.regs[13],
786 r14: regs.regs[14],
787 r15: regs.regs[15],
788 rip: regs.rip,
789 // Update the lower 32 bits of rflags.
790 rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
791 };
792 vcpu.set_regs(&gregs).map_err(Error::WriteRegs)?;
793
794 // Segment registers: CS, SS, DS, ES, FS, GS
795 // Since GDB cares only about the selectors, we call get_sregs() first.
796 let mut sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
797 sregs.cs.selector = regs.segments.cs as u16;
798 sregs.ss.selector = regs.segments.ss as u16;
799 sregs.ds.selector = regs.segments.ds as u16;
800 sregs.es.selector = regs.segments.es as u16;
801 sregs.fs.selector = regs.segments.fs as u16;
802 sregs.gs.selector = regs.segments.gs as u16;
803
804 vcpu.set_sregs(&sregs).map_err(Error::WriteRegs)?;
805
806 // TODO(keiichiw): Other registers such as FPU, xmm and mxcsr.
807
808 Ok(())
809 }
810
811 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
812 fn debug_read_memory<T: VcpuX86_64>(
813 vcpu: &T,
814 guest_mem: &GuestMemory,
815 vaddr: GuestAddress,
816 len: usize,
817 ) -> Result<Vec<u8>> {
818 let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
819 let mut buf = vec![0; len];
820 let mut total_read = 0u64;
821 // Handle reads across page boundaries.
822
823 while total_read < len as u64 {
824 let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_read, &sregs)?;
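// Clamp the read so that it does not cross the end of the current physical page.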
825 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
826 guest_mem
827 .get_slice_at_addr(GuestAddress(paddr), read_len as usize)
828 .map_err(Error::ReadingGuestMemory)?
829 .copy_to(&mut buf[total_read as usize..]);
830 total_read += read_len;
831 }
832 Ok(buf)
833 }
834
835 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
836 fn debug_write_memory<T: VcpuX86_64>(
837 vcpu: &T,
838 guest_mem: &GuestMemory,
839 vaddr: GuestAddress,
840 buf: &[u8],
841 ) -> Result<()> {
842 let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
843 let mut total_written = 0u64;
844 // Handle writes across page boundaries.
845 while total_written < buf.len() as u64 {
846 let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_written, &sregs)?;
847 let write_len = std::cmp::min(
848 buf.len() as u64 - total_written,
849 psize - (paddr & (psize - 1)),
850 );
851
852 guest_mem
853 .write_all_at_addr(
854 &buf[total_written as usize..(total_written as usize + write_len as usize)],
855 GuestAddress(paddr),
856 )
857 .map_err(Error::WritingGuestMemory)?;
858 total_written += write_len;
859 }
860 Ok(())
861 }
862
863 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
864 fn debug_enable_singlestep<T: VcpuX86_64>(vcpu: &T) -> Result<()> {
865 vcpu.set_guest_debug(&[], true /* enable_singlestep */)
866 .map_err(Error::EnableSinglestep)
867 }
868
869 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
870 fn debug_set_hw_breakpoints<T: VcpuX86_64>(
871 vcpu: &T,
872 breakpoints: &[GuestAddress],
873 ) -> Result<()> {
874 vcpu.set_guest_debug(breakpoints, false /* enable_singlestep */)
875 .map_err(Error::SetHwBreakpoint)
876 }
877 }
878
879 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
880 // return the translated address and the size of the page it resides in.
881 fn phys_addr(mem: &GuestMemory, vaddr: u64, sregs: &Sregs) -> Result<(u64, u64)> {
882 const CR0_PG_MASK: u64 = 1 << 31;
883 const CR4_PAE_MASK: u64 = 1 << 5;
884 const CR4_LA57_MASK: u64 = 1 << 12;
885 const MSR_EFER_LMA: u64 = 1 << 10;
886 // bits 12 through 51 are the address in a PTE.
887 const PTE_ADDR_MASK: u64 = ((1 << 52) - 1) & !0x0fff;
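// i.e. PTE_ADDR_MASK == 0x000f_ffff_ffff_f000.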
888 const PAGE_PRESENT: u64 = 0x1;
889 const PAGE_PSE_MASK: u64 = 0x1 << 7;
890
891 const PAGE_SIZE_4K: u64 = 4 * 1024;
892 const PAGE_SIZE_2M: u64 = 2 * 1024 * 1024;
893 const PAGE_SIZE_1G: u64 = 1024 * 1024 * 1024;
894
895 fn next_pte(mem: &GuestMemory, curr_table_addr: u64, vaddr: u64, level: usize) -> Result<u64> {
896 let ent: u64 = mem
897 .read_obj_from_addr(GuestAddress(
898 (curr_table_addr & PTE_ADDR_MASK) + page_table_offset(vaddr, level),
899 ))
900 .map_err(|_| Error::TranslatingVirtAddr)?;
901 /* TODO - convert to a trace
902 println!(
903 "level {} vaddr {:x} table-addr {:x} mask {:x} ent {:x} offset {:x}",
904 level,
905 vaddr,
906 curr_table_addr,
907 PTE_ADDR_MASK,
908 ent,
909 page_table_offset(vaddr, level)
910 );
911 */
912 if ent & PAGE_PRESENT == 0 {
913 return Err(Error::PageNotPresent);
914 }
915 Ok(ent)
916 }
917
918 // Get the offset into the page of `vaddr`.
919 fn page_offset(vaddr: u64, page_size: u64) -> u64 {
920 vaddr & (page_size - 1)
921 }
922
923 // Get the offset into the page table of the given `level` specified by the virtual `addr`.
924 // `level` is 1 through 5 on x86_64 to handle the five levels of paging.
925 fn page_table_offset(addr: u64, level: usize) -> u64 {
926 let offset = (level - 1) * 9 + 12;
927 ((addr >> offset) & 0x1ff) << 3
928 }
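// For example, for the top-level (PML4) lookup, page_table_offset(vaddr, 4) uses
// offset = (4 - 1) * 9 + 12 = 39, so bits 39..=47 of the virtual address select the
// entry, and the index is shifted left by 3 to get a byte offset (8 bytes per entry).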
929
930 if sregs.cr0 & CR0_PG_MASK == 0 {
931 return Ok((vaddr, PAGE_SIZE_4K));
932 }
933
934 if sregs.cr4 & CR4_PAE_MASK == 0 {
935 return Err(Error::TranslatingVirtAddr);
936 }
937
938 if sregs.efer & MSR_EFER_LMA != 0 {
939 // TODO - check LA57
940 if sregs.cr4 & CR4_LA57_MASK != 0 {}
941 let p4_ent = next_pte(mem, sregs.cr3, vaddr, 4)?;
942 let p3_ent = next_pte(mem, p4_ent, vaddr, 3)?;
943 // TODO check if it's a 1G page with the PSE bit in p2_ent
944 if p3_ent & PAGE_PSE_MASK != 0 {
945 // It's a 1G page with the PSE bit in p3_ent
946 let paddr = p3_ent & PTE_ADDR_MASK | page_offset(vaddr, PAGE_SIZE_1G);
947 return Ok((paddr, PAGE_SIZE_1G));
948 }
949 let p2_ent = next_pte(mem, p3_ent, vaddr, 2)?;
950 if p2_ent & PAGE_PSE_MASK != 0 {
951 // It's a 2M page with the PSE bit in p2_ent
952 let paddr = p2_ent & PTE_ADDR_MASK | page_offset(vaddr, PAGE_SIZE_2M);
953 return Ok((paddr, PAGE_SIZE_2M));
954 }
955 let p1_ent = next_pte(mem, p2_ent, vaddr, 1)?;
956 let paddr = p1_ent & PTE_ADDR_MASK | page_offset(vaddr, PAGE_SIZE_4K);
957 return Ok((paddr, PAGE_SIZE_4K));
958 }
959 Err(Error::TranslatingVirtAddr)
960 }
961
962 // OSC returned status register in CDW1
963 const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4;
964 // pci host bridge OSC returned control register in CDW3
965 #[allow(dead_code)]
966 const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1;
967 const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2;
968 const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4;
969 const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8;
970 #[allow(dead_code)]
971 const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10;
972
973 struct PciRootOSC {}
974
975 // Method (_OSC, 4, NotSerialized) // _OSC: Operating System Capabilities
976 // {
977 // CreateDWordField (Arg3, Zero, CDW1) // flag and return value
978 // If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
979 // {
980 // CreateDWordField (Arg3, 8, CDW3) // control field
981 // if ( 0 == (CDW1 & 0x01)) // Query flag ?
982 // {
983 // CDW3 &= !(SHPC_HP | PME | AER)
984 // }
985 // } Else {
986 // CDW1 |= UNSUPPORT_UUID
987 // }
988 // Return (Arg3)
989 // }
990 impl Aml for PciRootOSC {
991 fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
992 let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
993 // The virtual PCIe root port supports hotplug and the PCIe capability register only;
994 // clear all the other bits.
995 let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP
996 | PCI_HB_OSC_CONTROL_PCIE_PME
997 | PCI_HB_OSC_CONTROL_PCIE_AER);
998 aml::Method::new(
999 "_OSC".into(),
1000 4,
1001 false,
1002 vec![
1003 &aml::CreateDWordField::new(
1004 &aml::Name::new_field_name("CDW1"),
1005 &aml::Arg(3),
1006 &aml::ZERO,
1007 ),
1008 &aml::If::new(
1009 &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
1010 vec![
1011 &aml::CreateDWordField::new(
1012 &aml::Name::new_field_name("CDW3"),
1013 &aml::Arg(3),
1014 &(8_u8),
1015 ),
1016 &aml::If::new(
1017 &aml::Equal::new(
1018 &aml::ZERO,
1019 &aml::And::new(
1020 &aml::Local(0),
1021 &aml::Name::new_field_name("CDW1"),
1022 &aml::ONE,
1023 ),
1024 ),
1025 vec![&aml::And::new(
1026 &aml::Name::new_field_name("CDW3"),
1027 &mask,
1028 &aml::Name::new_field_name("CDW3"),
1029 )],
1030 ),
1031 ],
1032 ),
1033 &aml::Else::new(vec![&aml::Or::new(
1034 &aml::Name::new_field_name("CDW1"),
1035 &OSC_STATUS_UNSUPPORT_UUID,
1036 &aml::Name::new_field_name("CDW1"),
1037 )]),
1038 &aml::Return::new(&aml::Arg(3)),
1039 ],
1040 )
1041 .to_aml_bytes(aml)
1042 }
1043 }
1044
1045 impl X8664arch {
1046 /// Loads the bios from an open file.
1047 ///
1048 /// # Arguments
1049 ///
1050 /// * `mem` - The memory to be used by the guest.
1051 /// * `bios_image` - the File object for the specified bios
1052 fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1053 let bios_image_length = bios_image
1054 .seek(io::SeekFrom::End(0))
1055 .map_err(Error::LoadBios)?;
1056 if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1057 return Err(Error::LoadBios(io::Error::new(
1058 io::ErrorKind::InvalidData,
1059 format!(
1060 "bios was {} bytes, expected less than {}",
1061 bios_image_length, FIRST_ADDR_PAST_32BITS,
1062 ),
1063 )));
1064 }
1065 bios_image
1066 .seek(io::SeekFrom::Start(0))
1067 .map_err(Error::LoadBios)?;
1068 mem.read_to_memory(
1069 bios_start(bios_image_length),
1070 bios_image,
1071 bios_image_length as usize,
1072 )
1073 .map_err(Error::SetupGuestMemory)?;
1074 Ok(())
1075 }
1076
1077 /// Loads the kernel from an open file.
1078 ///
1079 /// # Arguments
1080 ///
1081 /// * `mem` - The memory to be used by the guest.
1082 /// * `kernel_image` - the File object for the specified kernel.
1083 fn load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)> {
1084 let elf_result =
1085 kernel_loader::load_kernel(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image);
1086 if elf_result == Err(kernel_loader::Error::InvalidElfMagicNumber) {
1087 bzimage::load_bzimage(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image)
1088 .map_err(Error::LoadBzImage)
1089 } else {
1090 let kernel_end = elf_result.map_err(Error::LoadKernel)?;
1091 Ok((Default::default(), kernel_end))
1092 }
1093 }
1094
1095 /// Configures the system memory space. This should be called once per VM before
1096 /// starting the vCPU threads.
1097 ///
1098 /// # Arguments
1099 ///
1100 /// * `mem` - The memory to be used by the guest.
1101 /// * `cmdline` - the kernel commandline
1102 /// * `initrd_file` - an initial ramdisk image
1103 fn setup_system_memory(
1104 mem: &GuestMemory,
1105 cmdline: &CStr,
1106 initrd_file: Option<File>,
1107 android_fstab: Option<File>,
1108 kernel_end: u64,
1109 params: boot_params,
1110 ) -> Result<()> {
1111 kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
1112 .map_err(Error::LoadCmdline)?;
1113
1114 // Track the first free address after the kernel - this is where extra
1115 // data like the device tree blob and initrd will be loaded.
1116 let mut free_addr = kernel_end;
1117
1118 let setup_data = if let Some(android_fstab) = android_fstab {
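// Round free_addr up to the next 64-byte boundary and then leave one extra
// 64-byte slot of padding before placing the device tree blob.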
1119 let free_addr_aligned = (((free_addr + 64 - 1) / 64) * 64) + 64;
1120 let dtb_start = GuestAddress(free_addr_aligned);
1121 let dtb_size = fdt::create_fdt(
1122 X86_64_FDT_MAX_SIZE as usize,
1123 mem,
1124 dtb_start.offset(),
1125 android_fstab,
1126 )
1127 .map_err(Error::CreateFdt)?;
1128 free_addr = dtb_start.offset() + dtb_size as u64;
1129 Some(dtb_start)
1130 } else {
1131 None
1132 };
1133
1134 let initrd = match initrd_file {
1135 Some(mut initrd_file) => {
1136 let mut initrd_addr_max = u64::from(params.hdr.initrd_addr_max);
1137 // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
1138 if initrd_addr_max == 0 {
1139 initrd_addr_max = 0x37FFFFFF;
1140 }
1141
1142 let mem_max = mem.end_addr().offset() - 1;
1143 if initrd_addr_max > mem_max {
1144 initrd_addr_max = mem_max;
1145 }
1146
1147 let (initrd_start, initrd_size) = arch::load_image_high(
1148 mem,
1149 &mut initrd_file,
1150 GuestAddress(free_addr),
1151 GuestAddress(initrd_addr_max),
1152 base::pagesize() as u64,
1153 )
1154 .map_err(Error::LoadInitrd)?;
1155 Some((initrd_start, initrd_size))
1156 }
1157 None => None,
1158 };
1159
1160 configure_system(
1161 mem,
1162 GuestAddress(KERNEL_START_OFFSET),
1163 GuestAddress(CMDLINE_OFFSET),
1164 cmdline.to_bytes().len() + 1,
1165 setup_data,
1166 initrd,
1167 params,
1168 )?;
1169 Ok(())
1170 }
1171
1172 fn get_pcie_vcfg_mmio_base(mem: &GuestMemory) -> u64 {
1173 // Put the PCIe VCFG region at a 2 MB boundary after physical memory or at 4 GB, whichever is greater.
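// For example, a guest whose RAM ends at or below 4 GB gets the VCFG region at exactly 4 GB.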
1174 let ram_end_round_2mb = (mem.end_addr().offset() + 2 * MB - 1) / (2 * MB) * (2 * MB);
1175 std::cmp::max(ram_end_round_2mb, 4 * GB)
1176 }
1177
1178 /// This returns the start address of high mmio
1179 ///
1180 /// # Arguments
1181 ///
1182 /// * mem: The memory to be used by the guest
1183 fn get_high_mmio_base(mem: &GuestMemory) -> u64 {
1184 Self::get_pcie_vcfg_mmio_base(mem) + PCIE_VCFG_MMIO_SIZE
1185 }
1186
1187 /// This returns the size of high mmio
1188 ///
1189 /// # Arguments
1190 ///
1191 /// * `vm`: The virtual machine
1192 fn get_high_mmio_size<V: Vm>(vm: &V) -> u64 {
1193 let phys_mem_end = 1u64 << vm.get_guest_phys_addr_bits();
1194 let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
1195 high_mmio_end - Self::get_high_mmio_base(vm.get_memory())
1196 }
1197
1198 /// This returns a minimal kernel command for this architecture
1199 fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
1200 let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
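// panic=-1 makes the guest kernel reboot immediately on panic instead of hanging.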
1201 cmdline.insert_str("panic=-1").unwrap();
1202
1203 cmdline
1204 }
1205
1206 /// Sets up the legacy x86 IO platform devices
1207 ///
1208 /// # Arguments
1209 ///
1210 /// * - `io_bus` - the IO bus object
1211 /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
1212 /// * - `reset_evt` - the event object which should receive exit events
1213 /// * - `mem_size` - the size in bytes of physical ram for the guest
1214 fn setup_legacy_devices(
1215 io_bus: &devices::Bus,
1216 pit_uses_speaker_port: bool,
1217 reset_evt: Event,
1218 mem_size: u64,
1219 ) -> Result<()> {
1220 struct NoDevice;
1221 impl devices::BusDevice for NoDevice {
1222 fn debug_label(&self) -> String {
1223 "no device".to_owned()
1224 }
1225 }
1226
1227 let mem_regions = arch_memory_regions(mem_size, None);
1228
1229 let mem_below_4g = mem_regions
1230 .iter()
1231 .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
1232 .map(|r| r.1)
1233 .sum();
1234
1235 let mem_above_4g = mem_regions
1236 .iter()
1237 .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
1238 .map(|r| r.1)
1239 .sum();
1240
1241 io_bus
1242 .insert(
1243 Arc::new(Mutex::new(devices::Cmos::new(mem_below_4g, mem_above_4g))),
1244 0x70,
1245 0x2,
1246 )
1247 .unwrap();
1248
1249 let nul_device = Arc::new(Mutex::new(NoDevice));
1250 let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
1251 reset_evt.try_clone().map_err(Error::CloneEvent)?,
1252 )));
1253
1254 if pit_uses_speaker_port {
1255 io_bus.insert(i8042, 0x062, 0x3).unwrap();
1256 } else {
1257 io_bus.insert(i8042, 0x061, 0x4).unwrap();
1258 }
1259
1260 io_bus.insert(nul_device.clone(), 0x0ed, 0x1).unwrap(); // most likely this one does nothing
1261 io_bus.insert(nul_device, 0x0f0, 0x2).unwrap(); // ignore fpu
1262
1263 Ok(())
1264 }
1265
1266 /// Sets up the ACPI devices for this platform and
1267 /// returns the resources which are used to set up the ACPI tables.
1268 ///
1269 /// # Arguments
1270 ///
1271 /// * - `io_bus` the I/O bus to add the devices to
1272 /// * - `resources` the SystemAllocator to allocate IO and MMIO for acpi
1273 /// devices.
1274 /// * - `suspend_evt` the event object which is used to suspend the VM
1275 /// * - `sdts` ACPI system description tables
1276 /// * - `irq_chip` the IrqChip object for registering irq events
1277 /// * - `battery` indicate whether to create the battery
1278 /// * - `mmio_bus` the MMIO bus to add the devices to
1279 fn setup_acpi_devices(
1280 mem: &GuestMemory,
1281 io_bus: &devices::Bus,
1282 resources: &mut SystemAllocator,
1283 suspend_evt: Event,
1284 exit_evt: Event,
1285 sdts: Vec<SDT>,
1286 #[cfg(feature = "direct")] direct_gpe: &[u32],
1287 irq_chip: &mut dyn IrqChip,
1288 sci_irq: u32,
1289 battery: (&Option<BatteryType>, Option<Minijail>),
1290 mmio_bus: &devices::Bus,
1291 max_bus: u8,
1292 resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
1293 ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
1294 // The AML data for the acpi devices
1295 let mut amls = Vec::new();
1296
1297 let bat_control = if let Some(battery_type) = battery.0 {
1298 match battery_type {
1299 BatteryType::Goldfish => {
1300 let control_tube = arch::add_goldfish_battery(
1301 &mut amls, battery.1, mmio_bus, irq_chip, sci_irq, resources,
1302 )
1303 .map_err(Error::CreateBatDevices)?;
1304 Some(BatControl {
1305 type_: BatteryType::Goldfish,
1306 control_tube,
1307 })
1308 }
1309 }
1310 } else {
1311 None
1312 };
1313
1314 let pm_alloc = resources.get_anon_alloc();
1315 let pm_iobase = match resources.io_allocator() {
1316 Some(io) => io
1317 .allocate_with_align(
1318 devices::acpi::ACPIPM_RESOURCE_LEN as u64,
1319 pm_alloc,
1320 "ACPIPM".to_string(),
1321 4, // must be 32-bit aligned
1322 )
1323 .map_err(Error::AllocateIOResouce)?,
1324 None => 0x600,
1325 };
1326
1327 let pcie_vcfg = aml::Name::new("VCFG".into(), &Self::get_pcie_vcfg_mmio_base(mem));
1328 pcie_vcfg.to_aml_bytes(&mut amls);
1329
1330 let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
1331 irq_chip
1332 .register_level_irq_event(sci_irq, &pm_sci_evt)
1333 .map_err(Error::RegisterIrqfd)?;
1334
1335 #[cfg(feature = "direct")]
1336 let direct_gpe_info = if direct_gpe.is_empty() {
1337 None
1338 } else {
1339 let direct_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
1340 let mut sci_devirq =
1341 devices::DirectIrq::new_level(&direct_sci_evt).map_err(Error::CreateGpe)?;
1342
1343 sci_devirq.sci_irq_prepare().map_err(Error::CreateGpe)?;
1344
1345 for gpe in direct_gpe {
1346 sci_devirq
1347 .gpe_enable_forwarding(*gpe)
1348 .map_err(Error::CreateGpe)?;
1349 }
1350
1351 Some((direct_sci_evt, direct_gpe))
1352 };
1353
1354 let mut pmresource = devices::ACPIPMResource::new(
1355 pm_sci_evt,
1356 #[cfg(feature = "direct")]
1357 direct_gpe_info,
1358 suspend_evt,
1359 exit_evt,
1360 );
1361 pmresource.to_aml_bytes(&mut amls);
1362 pmresource.start();
1363
1364 let mut crs_entries: Vec<Box<dyn Aml>> = vec![
1365 Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
1366 Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
1367 ];
1368 for r in resources.mmio_pools() {
1369 let entry: Box<dyn Aml> = match (u32::try_from(*r.start()), u32::try_from(*r.end())) {
1370 (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
1371 aml::AddressSpaceCachable::NotCacheable,
1372 true,
1373 start,
1374 end,
1375 )),
1376 _ => Box::new(aml::AddressSpace::new_memory(
1377 aml::AddressSpaceCachable::NotCacheable,
1378 true,
1379 *r.start(),
1380 *r.end(),
1381 )),
1382 };
1383 crs_entries.push(entry);
1384 }
1385
1386 let mut pci_dsdt_inner_data: Vec<&dyn aml::Aml> = Vec::new();
1387 let hid = aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08"));
1388 pci_dsdt_inner_data.push(&hid);
1389 let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03"));
1390 pci_dsdt_inner_data.push(&cid);
1391 let adr = aml::Name::new("_ADR".into(), &aml::ZERO);
1392 pci_dsdt_inner_data.push(&adr);
1393 let seg = aml::Name::new("_SEG".into(), &aml::ZERO);
1394 pci_dsdt_inner_data.push(&seg);
1395 let uid = aml::Name::new("_UID".into(), &aml::ZERO);
1396 pci_dsdt_inner_data.push(&uid);
1397 let supp = aml::Name::new("SUPP".into(), &aml::ZERO);
1398 pci_dsdt_inner_data.push(&supp);
1399 let crs = aml::Name::new(
1400 "_CRS".into(),
1401 &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
1402 );
1403 pci_dsdt_inner_data.push(&crs);
1404
1405 let pci_root_osc = PciRootOSC {};
1406 pci_dsdt_inner_data.push(&pci_root_osc);
1407
1408 aml::Device::new("_SB_.PCI0".into(), pci_dsdt_inner_data).to_aml_bytes(&mut amls);
1409
1410 let pm = Arc::new(Mutex::new(pmresource));
1411 io_bus
1412 .insert(
1413 pm.clone(),
1414 pm_iobase as u64,
1415 devices::acpi::ACPIPM_RESOURCE_LEN as u64,
1416 )
1417 .unwrap();
1418 resume_notify_devices.push(pm.clone());
1419
1420 Ok((
1421 acpi::AcpiDevResource {
1422 amls,
1423 pm_iobase,
1424 pm,
1425 sdts,
1426 },
1427 bat_control,
1428 ))
1429 }
1430
1431 /// Sets up the serial devices for this platform and registers their interrupt
1432 /// events with the IRQ chip.
1433 ///
1434 /// # Arguments
1435 ///
1436 /// * - `irq_chip` the IrqChip object for registering irq events
1437 /// * - `io_bus` the I/O bus to add the devices to
1438 /// * - `serial_parameters` - definitions for how the serial devices should be configured
1439 fn setup_serial_devices(
1440 protected_vm: ProtectionType,
1441 irq_chip: &mut dyn IrqChip,
1442 io_bus: &devices::Bus,
1443 serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
1444 serial_jail: Option<Minijail>,
1445 ) -> Result<()> {
1446 let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
1447 let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
1448
1449 arch::add_serial_devices(
1450 protected_vm,
1451 io_bus,
1452 com_evt_1_3.get_trigger(),
1453 com_evt_2_4.get_trigger(),
1454 serial_parameters,
1455 serial_jail,
1456 )
1457 .map_err(Error::CreateSerialDevices)?;
1458
1459 irq_chip
1460 .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3)
1461 .map_err(Error::RegisterIrqfd)?;
1462 irq_chip
1463 .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4)
1464 .map_err(Error::RegisterIrqfd)?;
1465
1466 Ok(())
1467 }
1468 }
1469
1470 #[cfg(test)]
1471 mod test_integration;
1472
1473 #[cfg(test)]
1474 mod tests {
1475 use super::*;
1476
1477 #[test]
1478 fn regions_lt_4gb_nobios() {
1479 let regions = arch_memory_regions(512 * MB, /* bios_size */ None);
1480 assert_eq!(1, regions.len());
1481 assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
1482 assert_eq!(1u64 << 29, regions[0].1);
1483 }
1484
1485 #[test]
1486 fn regions_gt_4gb_nobios() {
1487 let size = 4 * GB + 0x8000;
1488 let regions = arch_memory_regions(size, /* bios_size */ None);
1489 assert_eq!(2, regions.len());
1490 assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
1491 assert_eq!(GuestAddress(4 * GB), regions[1].0);
1492 assert_eq!(4 * GB + 0x8000, regions[0].1 + regions[1].1);
1493 }
1494
1495 #[test]
1496 fn regions_lt_4gb_bios() {
1497 let bios_len = 1 * MB;
1498 let regions = arch_memory_regions(512 * MB, Some(bios_len));
1499 assert_eq!(2, regions.len());
1500 assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
1501 assert_eq!(512 * MB, regions[0].1);
1502 assert_eq!(
1503 GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
1504 regions[1].0
1505 );
1506 assert_eq!(bios_len, regions[1].1);
1507 }
1508
1509 #[test]
1510 fn regions_gt_4gb_bios() {
1511 let bios_len = 1 * MB;
1512 let regions = arch_memory_regions(4 * GB + 0x8000, Some(bios_len));
1513 assert_eq!(3, regions.len());
1514 assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
1515 assert_eq!(
1516 GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
1517 regions[1].0
1518 );
1519 assert_eq!(bios_len, regions[1].1);
1520 assert_eq!(GuestAddress(4 * GB), regions[2].0);
1521 }
1522
1523 #[test]
1524 fn regions_eq_4gb_nobios() {
1525 // Test with exact size of 4GB - the overhead.
1526 let regions = arch_memory_regions(
1527 4 * GB - MEM_32BIT_GAP_SIZE - START_OF_RAM_32BITS,
1528 /* bios_size */ None,
1529 );
1530 dbg!(&regions);
1531 assert_eq!(1, regions.len());
1532 assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
1533 assert_eq!(
1534 4 * GB - MEM_32BIT_GAP_SIZE - START_OF_RAM_32BITS,
1535 regions[0].1
1536 );
1537 }
1538
1539 #[test]
1540 fn regions_eq_4gb_bios() {
1541 // Test with exact size of 4GB - the overhead.
1542 let bios_len = 1 * MB;
1543 let regions = arch_memory_regions(
1544 4 * GB - MEM_32BIT_GAP_SIZE - START_OF_RAM_32BITS,
1545 Some(bios_len),
1546 );
1547 assert_eq!(2, regions.len());
1548 assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
1549 assert_eq!(
1550 4 * GB - MEM_32BIT_GAP_SIZE - START_OF_RAM_32BITS,
1551 regions[0].1
1552 );
1553 assert_eq!(
1554 GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
1555 regions[1].0
1556 );
1557 assert_eq!(bios_len, regions[1].1);
1558 }
1559
1560 #[test]
1561 #[cfg(feature = "direct")]
1562 fn end_addr_before_32bits() {
1563 // On volteer, type16 (coreboot) region is at 0x00000000769f3000-0x0000000076ffffff.
1564 // On brya, type16 region is at 0x0000000076876000-0x00000000803fffff
1565 let brya_type16_address = 0x7687_6000;
1566 assert!(
1567 END_ADDR_BEFORE_32BITS < brya_type16_address,
1568 "{} < {}",
1569 END_ADDR_BEFORE_32BITS,
1570 brya_type16_address
1571 );
1572 }
1573
1574 #[test]
1575 fn check_32bit_gap_size_alignment() {
1576 // 32bit gap memory is 256 MB aligned to be friendly for MTRR mappings.
1577 assert_eq!(MEM_32BIT_GAP_SIZE % (256 * MB), 0);
1578 }
1579 }
1580