1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 //! x86 architecture support.
6
7 #![cfg(target_arch = "x86_64")]
8
9 mod fdt;
10
// `setup_data` type identifiers from the Linux x86 boot protocol; used as the
// `type_` field of `setup_data_hdr` entries written by `write_setup_data`.
const SETUP_DTB: u32 = 2;
const SETUP_RNG_SEED: u32 = 9;
13
14 #[allow(dead_code)]
15 #[allow(non_upper_case_globals)]
16 #[allow(non_camel_case_types)]
17 #[allow(non_snake_case)]
18 pub mod bootparam;
19
20 #[allow(dead_code)]
21 #[allow(non_upper_case_globals)]
22 mod msr_index;
23
24 #[allow(dead_code)]
25 #[allow(non_upper_case_globals)]
26 #[allow(non_camel_case_types)]
27 #[allow(clippy::all)]
28 mod mpspec;
29
30 pub mod acpi;
31 mod bzimage;
32 pub mod cpuid;
33 mod gdt;
34 pub mod interrupts;
35 pub mod mptable;
36 pub mod regs;
37 pub mod smbios;
38
39 use std::arch::x86_64::CpuidResult;
40 use std::collections::BTreeMap;
41 use std::ffi::CStr;
42 use std::ffi::CString;
43 use std::fs::File;
44 use std::io;
45 use std::mem;
46 use std::path::PathBuf;
47 use std::sync::mpsc;
48 use std::sync::Arc;
49
50 use acpi_tables::aml;
51 use acpi_tables::aml::Aml;
52 use acpi_tables::sdt::SDT;
53 use anyhow::Context;
54 use arch::get_serial_cmdline;
55 use arch::serial::SerialDeviceInfo;
56 use arch::CpuSet;
57 use arch::DtbOverlay;
58 use arch::GetSerialCmdlineError;
59 use arch::RunnableLinuxVm;
60 use arch::VmComponents;
61 use arch::VmImage;
62 #[cfg(feature = "seccomp_trace")]
63 use base::debug;
64 use base::warn;
65 #[cfg(any(target_os = "android", target_os = "linux"))]
66 use base::AsRawDescriptors;
67 use base::Event;
68 use base::FileGetLen;
69 use base::FileReadWriteAtVolatile;
70 use base::SendTube;
71 use base::Tube;
72 use base::TubeError;
73 use chrono::Utc;
74 pub use cpuid::adjust_cpuid;
75 pub use cpuid::CpuIdContext;
76 use devices::acpi::PM_WAKEUP_GPIO;
77 use devices::Bus;
78 use devices::BusDevice;
79 use devices::BusDeviceObj;
80 use devices::BusResumeDevice;
81 use devices::BusType;
82 use devices::Debugcon;
83 use devices::FwCfgParameters;
84 use devices::IrqChip;
85 use devices::IrqChipX86_64;
86 use devices::IrqEventSource;
87 use devices::PciAddress;
88 use devices::PciConfigIo;
89 use devices::PciConfigMmio;
90 use devices::PciDevice;
91 use devices::PciInterruptPin;
92 use devices::PciRoot;
93 use devices::PciRootCommand;
94 use devices::PciVirtualConfigMmio;
95 use devices::Pflash;
96 #[cfg(any(target_os = "android", target_os = "linux"))]
97 use devices::ProxyDevice;
98 use devices::Serial;
99 use devices::SerialHardware;
100 use devices::SerialParameters;
101 #[cfg(any(target_os = "android", target_os = "linux"))]
102 use devices::VirtualPmc;
103 use devices::FW_CFG_BASE_PORT;
104 use devices::FW_CFG_MAX_FILE_SLOTS;
105 use devices::FW_CFG_WIDTH;
106 #[cfg(feature = "gdb")]
107 use gdbstub_arch::x86::reg::id::X86_64CoreRegId;
108 #[cfg(feature = "gdb")]
109 use gdbstub_arch::x86::reg::X86SegmentRegs;
110 #[cfg(feature = "gdb")]
111 use gdbstub_arch::x86::reg::X86_64CoreRegs;
112 #[cfg(feature = "gdb")]
113 use gdbstub_arch::x86::reg::X87FpuInternalRegs;
114 #[cfg(feature = "gdb")]
115 use hypervisor::x86_64::Regs;
116 #[cfg(feature = "gdb")]
117 use hypervisor::x86_64::Sregs;
118 use hypervisor::CpuConfigX86_64;
119 use hypervisor::Hypervisor;
120 use hypervisor::HypervisorX86_64;
121 use hypervisor::ProtectionType;
122 use hypervisor::VcpuInitX86_64;
123 use hypervisor::VcpuX86_64;
124 use hypervisor::Vm;
125 use hypervisor::VmCap;
126 use hypervisor::VmX86_64;
127 #[cfg(feature = "seccomp_trace")]
128 use jail::read_jail_addr;
129 #[cfg(windows)]
130 use jail::FakeMinijailStub as Minijail;
131 #[cfg(any(target_os = "android", target_os = "linux"))]
132 use minijail::Minijail;
133 use once_cell::sync::OnceCell;
134 use rand::rngs::OsRng;
135 use rand::RngCore;
136 use remain::sorted;
137 use resources::AddressRange;
138 use resources::SystemAllocator;
139 use resources::SystemAllocatorConfig;
140 #[cfg(any(target_os = "android", target_os = "linux"))]
141 use sync::Condvar;
142 use sync::Mutex;
143 use thiserror::Error;
144 use vm_control::BatControl;
145 use vm_control::BatteryType;
146 use vm_memory::GuestAddress;
147 use vm_memory::GuestMemory;
148 use vm_memory::GuestMemoryError;
149 use vm_memory::MemoryRegionOptions;
150 use zerocopy::AsBytes;
151 use zerocopy::FromBytes;
152 use zerocopy::FromZeroes;
153
154 use crate::bootparam::boot_params;
155 use crate::cpuid::EDX_HYBRID_CPU_SHIFT;
156
157 #[sorted]
158 #[derive(Error, Debug)]
159 pub enum Error {
160 #[error("error allocating a single gpe")]
161 AllocateGpe,
162 #[error("error allocating IO resource: {0}")]
163 AllocateIOResouce(resources::Error),
164 #[error("error allocating a single irq")]
165 AllocateIrq,
166 #[error("unable to clone an Event: {0}")]
167 CloneEvent(base::Error),
168 #[error("failed to clone IRQ chip: {0}")]
169 CloneIrqChip(base::Error),
170 #[cfg(any(target_os = "android", target_os = "linux"))]
171 #[error("failed to clone jail: {0}")]
172 CloneJail(minijail::Error),
173 #[error("unable to clone a Tube: {0}")]
174 CloneTube(TubeError),
175 #[error("the given kernel command line was invalid: {0}")]
176 Cmdline(kernel_cmdline::Error),
177 #[error("failed to configure hotplugged pci device: {0}")]
178 ConfigurePciDevice(arch::DeviceRegistrationError),
179 #[error("failed to configure segment registers: {0}")]
180 ConfigureSegments(regs::Error),
181 #[error("error configuring the system")]
182 ConfigureSystem,
183 #[error("unable to create ACPI tables")]
184 CreateAcpi,
185 #[error("unable to create battery devices: {0}")]
186 CreateBatDevices(arch::DeviceRegistrationError),
187 #[error("could not create debugcon device: {0}")]
188 CreateDebugconDevice(devices::SerialError),
189 #[error("unable to make an Event: {0}")]
190 CreateEvent(base::Error),
191 #[error("failed to create fdt: {0}")]
192 CreateFdt(cros_fdt::Error),
193 #[error("failed to create fw_cfg device: {0}")]
194 CreateFwCfgDevice(devices::FwCfgError),
195 #[error("failed to create IOAPIC device: {0}")]
196 CreateIoapicDevice(base::Error),
197 #[error("failed to create a PCI root hub: {0}")]
198 CreatePciRoot(arch::DeviceRegistrationError),
199 #[error("unable to create PIT: {0}")]
200 CreatePit(base::Error),
201 #[error("unable to make PIT device: {0}")]
202 CreatePitDevice(devices::PitError),
203 #[cfg(any(target_os = "android", target_os = "linux"))]
204 #[error("unable to create proxy device: {0}")]
205 CreateProxyDevice(devices::ProxyError),
206 #[error("unable to create serial devices: {0}")]
207 CreateSerialDevices(arch::DeviceRegistrationError),
208 #[error("failed to create socket: {0}")]
209 CreateSocket(io::Error),
210 #[error("failed to create VCPU: {0}")]
211 CreateVcpu(base::Error),
212 #[error("failed to create Virtio MMIO bus: {0}")]
213 CreateVirtioMmioBus(arch::DeviceRegistrationError),
214 #[error("invalid e820 setup params")]
215 E820Configuration,
216 #[error("failed to enable singlestep execution: {0}")]
217 EnableSinglestep(base::Error),
218 #[error("failed to enable split irqchip: {0}")]
219 EnableSplitIrqchip(base::Error),
220 #[error("failed to get serial cmdline: {0}")]
221 GetSerialCmdline(GetSerialCmdlineError),
222 #[error("failed to insert device onto bus: {0}")]
223 InsertBus(devices::BusError),
224 #[error("the kernel extends past the end of RAM")]
225 InvalidCpuConfig,
226 #[error("invalid CPU config parameters")]
227 KernelOffsetPastEnd,
228 #[error("error loading bios: {0}")]
229 LoadBios(io::Error),
230 #[error("error loading kernel bzImage: {0}")]
231 LoadBzImage(bzimage::Error),
232 #[error("error loading command line: {0}")]
233 LoadCmdline(kernel_loader::Error),
234 #[error("error loading initrd: {0}")]
235 LoadInitrd(arch::LoadImageError),
236 #[error("error loading Kernel: {0}")]
237 LoadKernel(kernel_loader::Error),
238 #[error("error loading pflash: {0}")]
239 LoadPflash(io::Error),
240 #[error("error translating address: Page not present")]
241 PageNotPresent,
242 #[error("error reading guest memory {0}")]
243 ReadingGuestMemory(vm_memory::GuestMemoryError),
244 #[error("single register read not supported on x86_64")]
245 ReadRegIsUnsupported,
246 #[error("error reading CPU registers {0}")]
247 ReadRegs(base::Error),
248 #[error("error registering an IrqFd: {0}")]
249 RegisterIrqfd(base::Error),
250 #[error("error registering virtual socket device: {0}")]
251 RegisterVsock(arch::DeviceRegistrationError),
252 #[error("error reserved pcie config mmio")]
253 ReservePcieCfgMmio(resources::Error),
254 #[error("failed to set a hardware breakpoint: {0}")]
255 SetHwBreakpoint(base::Error),
256 #[error("failed to set identity map addr: {0}")]
257 SetIdentityMapAddr(base::Error),
258 #[error("failed to set interrupts: {0}")]
259 SetLint(interrupts::Error),
260 #[error("failed to set tss addr: {0}")]
261 SetTssAddr(base::Error),
262 #[error("failed to set up cmos: {0}")]
263 SetupCmos(anyhow::Error),
264 #[error("failed to set up cpuid: {0}")]
265 SetupCpuid(cpuid::Error),
266 #[error("setup data too large")]
267 SetupDataTooLarge,
268 #[error("failed to set up FPU: {0}")]
269 SetupFpu(base::Error),
270 #[error("failed to set up guest memory: {0}")]
271 SetupGuestMemory(GuestMemoryError),
272 #[error("failed to set up mptable: {0}")]
273 SetupMptable(mptable::Error),
274 #[error("failed to set up MSRs: {0}")]
275 SetupMsrs(base::Error),
276 #[error("failed to set up page tables: {0}")]
277 SetupPageTables(regs::Error),
278 #[error("failed to set up pflash: {0}")]
279 SetupPflash(anyhow::Error),
280 #[error("failed to set up registers: {0}")]
281 SetupRegs(regs::Error),
282 #[error("failed to set up SMBIOS: {0}")]
283 SetupSmbios(smbios::Error),
284 #[error("failed to set up sregs: {0}")]
285 SetupSregs(base::Error),
286 #[error("failed to translate virtual address")]
287 TranslatingVirtAddr,
288 #[error("protected VMs not supported on x86_64")]
289 UnsupportedProtectionType,
290 #[error("single register write not supported on x86_64")]
291 WriteRegIsUnsupported,
292 #[error("error writing CPU registers {0}")]
293 WriteRegs(base::Error),
294 #[error("error writing guest memory {0}")]
295 WritingGuestMemory(GuestMemoryError),
296 #[error("error writing setup_data: {0}")]
297 WritingSetupData(GuestMemoryError),
298 #[error("the zero page extends past the end of guest_mem")]
299 ZeroPagePastRamEnd,
300 #[error("error writing the zero page of guest memory")]
301 ZeroPageSetup,
302 }
303
/// Convenience alias for results using this module's [`Error`] type.
pub type Result<T> = std::result::Result<T, Error>;

/// Marker type implementing the x86_64 architecture support entry points
/// (see the `arch::LinuxArch` impl below).
pub struct X8664arch;
307
// Like `bootparam::setup_data` without the incomplete array field at the end, which allows us to
// safely implement Copy, Clone
#[repr(C)]
#[derive(Copy, Clone, Default, FromZeroes, FromBytes, AsBytes)]
struct setup_data_hdr {
    // Guest physical address of the next setup_data entry, or 0 for the list terminator.
    pub next: u64,
    // One of the SETUP_* type identifiers (e.g. SETUP_DTB, SETUP_RNG_SEED).
    pub type_: u32,
    // Length in bytes of the payload that follows this header in guest memory.
    pub len: u32,
}
317
/// The type of a `setup_data` entry, matching the Linux boot protocol type codes.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SetupDataType {
    /// Device tree blob.
    Dtb = SETUP_DTB,
    /// Random seed for the guest kernel's entropy pool.
    RngSeed = SETUP_RNG_SEED,
}
324
/// A single entry to be inserted in the bootparam `setup_data` linked list.
pub struct SetupData {
    /// Raw payload bytes written immediately after the entry header.
    pub data: Vec<u8>,
    /// Entry type code stored in the header's `type_` field.
    pub type_: SetupDataType,
}
330
// e820 memory map entry types as consumed by the guest kernel
// (written into `boot_params::e820_table[].type_` by `add_e820_entry`).
enum E820Type {
    Ram = 0x01,
    Reserved = 0x2,
}
335
// Size units used throughout the guest physical memory layout below.
const MB: u64 = 1 << 20;
const GB: u64 = 1 << 30;

// Initial stack pointer for the boot CPU before the kernel sets up its own stack.
pub const BOOT_STACK_POINTER: u64 = 0x8000;
const START_OF_RAM_32BITS: u64 = 0;
// First guest physical address above the 32-bit (4 GiB) boundary.
const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
// Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
const HIGH_MMIO_MAX_END: u64 = (1u64 << 46) - 1;
pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
// Guest physical address where the boot_params "zero page" is written.
pub const ZERO_PAGE_OFFSET: u64 = 0x7000;
// Set BIOS max size to 16M: this is used only when `unrestricted guest` is disabled
const BIOS_MAX_SIZE: u64 = 0x1000000;

// Load address for the kernel image and the command line preceding it.
pub const KERNEL_START_OFFSET: u64 = 0x20_0000;
const CMDLINE_OFFSET: u64 = 0x2_0000;
const CMDLINE_MAX_SIZE: u64 = 0x800; // including terminating zero
// setup_data entries are placed directly after the command line, up to the RSDP window.
const SETUP_DATA_START: u64 = CMDLINE_OFFSET + CMDLINE_MAX_SIZE;
const SETUP_DATA_END: u64 = ACPI_HI_RSDP_WINDOW_BASE;
// Legacy PC IRQ lines for COM1/COM3 and COM2/COM4 serial ports.
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// X86_64_SCI_IRQ is used to fill the ACPI FACP table.
// The sci_irq number is better to be a legacy
// IRQ number which is less than 16(actually most of the
// platforms have fixed IRQ number 9). So we can
// reserve the IRQ number 5 for SCI and let the
// the other devices starts from next.
pub const X86_64_SCI_IRQ: u32 = 5;
// The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
pub const X86_64_IRQ_BASE: u32 = 9;
// Base of the high RSDP search window used as the upper bound for setup_data.
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;
366
/// CPU vendor, as reported by `cpuid::cpu_manufacturer`.
#[derive(Debug, PartialEq, Eq)]
pub enum CpuManufacturer {
    Intel,
    Amd,
    /// Any vendor other than Intel or AMD.
    Unknown,
}
373
/// Returns the manufacturer of the host CPU, as determined via CPUID.
pub fn get_cpu_manufacturer() -> CpuManufacturer {
    cpuid::cpu_manufacturer()
}
377
// Memory layout below 4G
struct LowMemoryLayout {
    // the pci mmio range below 4G
    pci_mmio: AddressRange,
    // the pcie cfg mmio range
    pcie_cfg_mmio: AddressRange,
}

// Initialized exactly once by `init_low_memory_layout`; read by the
// `read_pci_mmio_before_32bit` / `read_pcie_cfg_mmio` accessors below.
static LOW_MEMORY_LAYOUT: OnceCell<LowMemoryLayout> = OnceCell::new();
387
init_low_memory_layout(pcie_ecam: Option<AddressRange>, pci_low_start: Option<u64>)388 pub fn init_low_memory_layout(pcie_ecam: Option<AddressRange>, pci_low_start: Option<u64>) {
389 LOW_MEMORY_LAYOUT.get_or_init(|| {
390 // Make sure it align to 256MB for MTRR convenient
391 const MEM_32BIT_GAP_SIZE: u64 = 768 * MB;
392 // Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/.....
393 const RESERVED_MEM_SIZE: u64 = 0x800_0000;
394 const PCI_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
395 // Reserve 64MB for pcie enhanced configuration
396 const DEFAULT_PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000;
397 const DEFAULT_PCIE_CFG_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
398 const DEFAULT_PCIE_CFG_MMIO_START: u64 =
399 DEFAULT_PCIE_CFG_MMIO_END - DEFAULT_PCIE_CFG_MMIO_SIZE + 1;
400 const DEFAULT_PCIE_CFG_MMIO: AddressRange = AddressRange {
401 start: DEFAULT_PCIE_CFG_MMIO_START,
402 end: DEFAULT_PCIE_CFG_MMIO_END,
403 };
404
405 let pcie_cfg_mmio = pcie_ecam.unwrap_or(DEFAULT_PCIE_CFG_MMIO);
406
407 let pci_mmio = if let Some(pci_low) = pci_low_start {
408 AddressRange {
409 start: pci_low,
410 end: PCI_MMIO_END,
411 }
412 } else {
413 AddressRange {
414 start: pcie_cfg_mmio
415 .start
416 .min(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE),
417 end: PCI_MMIO_END,
418 }
419 };
420
421 LowMemoryLayout {
422 pci_mmio,
423 pcie_cfg_mmio,
424 }
425 });
426 }
427
read_pci_mmio_before_32bit() -> AddressRange428 pub fn read_pci_mmio_before_32bit() -> AddressRange {
429 LOW_MEMORY_LAYOUT.get().unwrap().pci_mmio
430 }
read_pcie_cfg_mmio() -> AddressRange431 pub fn read_pcie_cfg_mmio() -> AddressRange {
432 LOW_MEMORY_LAYOUT.get().unwrap().pcie_cfg_mmio
433 }
434
435 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
436 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
437 /// pointer at the effective physical address 0xFFFF_FFF0.
bios_start(bios_size: u64) -> GuestAddress438 fn bios_start(bios_size: u64) -> GuestAddress {
439 GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
440 }
441
identity_map_addr_start() -> GuestAddress442 fn identity_map_addr_start() -> GuestAddress {
443 // Set Identity map address 4 pages before the max BIOS size
444 GuestAddress(FIRST_ADDR_PAST_32BITS - BIOS_MAX_SIZE - 4 * 0x1000)
445 }
446
tss_addr_start() -> GuestAddress447 fn tss_addr_start() -> GuestAddress {
448 // Set TSS address one page after identity map address
449 GuestAddress(identity_map_addr_start().offset() + 0x1000)
450 }
451
tss_addr_end() -> GuestAddress452 fn tss_addr_end() -> GuestAddress {
453 // Set TSS address section to have 3 pages
454 GuestAddress(tss_addr_start().offset() + 0x3000)
455 }
456
/// Populates the boot_params "zero page" and writes it into guest memory.
///
/// Fills in the kernel header fields (loader type, command line pointer/size, setup_data
/// and initrd locations), builds the e820 memory map covering RAM and reserved regions,
/// and finally writes the completed `boot_params` to `ZERO_PAGE_OFFSET`.
///
/// # Arguments
/// * `guest_mem` - guest memory to write the zero page into.
/// * `kernel_addr` - load address of the kernel; RAM below 4G is reported from here.
/// * `cmdline_addr` / `cmdline_size` - location and length of the kernel command line.
/// * `setup_data` - address of the first entry of the setup_data list, if any.
/// * `initrd` - address and size of the initrd, if any.
/// * `params` - partially filled boot_params, completed and written by this function.
fn configure_system(
    guest_mem: &GuestMemory,
    kernel_addr: GuestAddress,
    cmdline_addr: GuestAddress,
    cmdline_size: usize,
    setup_data: Option<GuestAddress>,
    initrd: Option<(GuestAddress, usize)>,
    mut params: boot_params,
) -> Result<()> {
    const EBDA_START: u64 = 0x0009_fc00;
    const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
    const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
    const KERNEL_LOADER_OTHER: u8 = 0xff;
    const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.

    params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
    params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
    params.hdr.header = KERNEL_HDR_MAGIC;
    // The command line address is split into a low 32-bit pointer and an extended
    // high-half field.
    params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
    params.ext_cmd_line_ptr = (cmdline_addr.offset() >> 32) as u32;
    params.hdr.cmdline_size = cmdline_size as u32;
    params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
    if let Some(setup_data) = setup_data {
        params.hdr.setup_data = setup_data.offset();
    }
    if let Some((initrd_addr, initrd_size)) = initrd {
        params.hdr.ramdisk_image = initrd_addr.offset() as u32;
        params.hdr.ramdisk_size = initrd_size as u32;
    }

    // Conventional low memory below the EBDA is usable RAM.
    add_e820_entry(
        &mut params,
        AddressRange {
            start: START_OF_RAM_32BITS,
            end: EBDA_START - 1,
        },
        E820Type::Ram,
    )?;

    // GuestMemory::end_addr() returns the first address past the end, so subtract 1 to get the
    // inclusive end.
    let guest_mem_end = guest_mem.end_addr().offset() - 1;
    // RAM below 4G stops at the start of the PCI MMIO hole (or earlier, if memory is smaller).
    let ram_below_4g = AddressRange {
        start: kernel_addr.offset(),
        end: guest_mem_end.min(read_pci_mmio_before_32bit().start - 1),
    };
    let ram_above_4g = AddressRange {
        start: FIRST_ADDR_PAST_32BITS,
        end: guest_mem_end,
    };
    add_e820_entry(&mut params, ram_below_4g, E820Type::Ram)?;
    if !ram_above_4g.is_empty() {
        add_e820_entry(&mut params, ram_above_4g, E820Type::Ram)?
    }

    // Mark the PCIe ECAM window and the virtual-config MMIO window as reserved.
    let pcie_cfg_mmio_range = read_pcie_cfg_mmio();
    add_e820_entry(&mut params, pcie_cfg_mmio_range, E820Type::Reserved)?;

    add_e820_entry(
        &mut params,
        X8664arch::get_pcie_vcfg_mmio_range(guest_mem, &pcie_cfg_mmio_range),
        E820Type::Reserved,
    )?;

    // Reserve memory section for Identity map and TSS
    add_e820_entry(
        &mut params,
        AddressRange {
            start: identity_map_addr_start().offset(),
            end: tss_addr_end().offset() - 1,
        },
        E820Type::Reserved,
    )?;

    let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
    if !guest_mem.is_valid_range(zero_page_addr, mem::size_of::<boot_params>() as u64) {
        return Err(Error::ZeroPagePastRamEnd);
    }

    guest_mem
        .write_obj_at_addr(params, zero_page_addr)
        .map_err(|_| Error::ZeroPageSetup)?;

    Ok(())
}
542
/// Write setup_data entries in guest memory and link them together with the `next` field.
///
/// Each entry is written as a `setup_data_hdr` followed by its payload at a 64-bit
/// aligned address; the final entry's `next` is 0, terminating the list. Entries must
/// fit strictly within `[setup_data_start, setup_data_end)` or `SetupDataTooLarge` is
/// returned.
///
/// Returns the guest address of the first entry in the setup_data list, if any.
fn write_setup_data(
    guest_mem: &GuestMemory,
    setup_data_start: GuestAddress,
    setup_data_end: GuestAddress,
    setup_data: &[SetupData],
) -> Result<Option<GuestAddress>> {
    let mut setup_data_list_head = None;

    // Place the first setup_data at the first 64-bit aligned offset following setup_data_start.
    let mut setup_data_addr = setup_data_start.align(8).ok_or(Error::SetupDataTooLarge)?;

    // Peekable so we know whether a `next` link or the 0 terminator should be written.
    let mut entry_iter = setup_data.iter().peekable();
    while let Some(entry) = entry_iter.next() {
        if setup_data_list_head.is_none() {
            setup_data_list_head = Some(setup_data_addr);
        }

        // Ensure the entry (header plus data) fits into guest memory.
        let entry_size = (mem::size_of::<setup_data_hdr>() + entry.data.len()) as u64;
        let entry_end = setup_data_addr
            .checked_add(entry_size)
            .ok_or(Error::SetupDataTooLarge)?;

        if entry_end >= setup_data_end {
            return Err(Error::SetupDataTooLarge);
        }

        let next_setup_data_addr = if entry_iter.peek().is_some() {
            // Place the next setup_data at a 64-bit aligned address.
            setup_data_addr
                .checked_add(entry_size)
                .and_then(|addr| addr.align(8))
                .ok_or(Error::SetupDataTooLarge)?
        } else {
            // This is the final entry. Terminate the list with next == 0.
            GuestAddress(0)
        };

        let hdr = setup_data_hdr {
            next: next_setup_data_addr.offset(),
            type_: entry.type_ as u32,
            len: entry
                .data
                .len()
                .try_into()
                .map_err(|_| Error::SetupDataTooLarge)?,
        };

        // Write the header, then the payload immediately after it.
        guest_mem
            .write_obj_at_addr(hdr, setup_data_addr)
            .map_err(Error::WritingSetupData)?;
        guest_mem
            .write_all_at_addr(
                &entry.data,
                setup_data_addr.unchecked_add(mem::size_of::<setup_data_hdr>() as u64),
            )
            .map_err(Error::WritingSetupData)?;

        setup_data_addr = next_setup_data_addr;
    }

    Ok(setup_data_list_head)
}
609
610 /// Generate a SETUP_RNG_SEED SetupData with random seed data.
setup_data_rng_seed() -> SetupData611 fn setup_data_rng_seed() -> SetupData {
612 let mut data = vec![0u8; 256];
613 OsRng.fill_bytes(&mut data);
614 SetupData {
615 data,
616 type_: SetupDataType::RngSeed,
617 }
618 }
619
620 /// Add an e820 region to the e820 map.
621 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
add_e820_entry(params: &mut boot_params, range: AddressRange, mem_type: E820Type) -> Result<()>622 fn add_e820_entry(params: &mut boot_params, range: AddressRange, mem_type: E820Type) -> Result<()> {
623 if params.e820_entries >= params.e820_table.len() as u8 {
624 return Err(Error::E820Configuration);
625 }
626
627 let size = range.len().ok_or(Error::E820Configuration)?;
628
629 params.e820_table[params.e820_entries as usize].addr = range.start;
630 params.e820_table[params.e820_entries as usize].size = size;
631 params.e820_table[params.e820_entries as usize].type_ = mem_type as u32;
632 params.e820_entries += 1;
633
634 Ok(())
635 }
636
637 /// Returns a Vec of the valid memory addresses.
638 /// These should be used to configure the GuestMemory structure for the platform.
639 /// For x86_64 all addresses are valid from the start of the kernel except a
640 /// carve out at the end of 32bit address space.
arch_memory_regions( size: u64, bios_size: Option<u64>, ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>641 pub fn arch_memory_regions(
642 size: u64,
643 bios_size: Option<u64>,
644 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
645 let mem_start = START_OF_RAM_32BITS;
646 let mem_end = GuestAddress(size + mem_start);
647
648 let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
649 let end_32bit_gap_start = GuestAddress(read_pci_mmio_before_32bit().start);
650
651 let mut regions = Vec::new();
652 if mem_end <= end_32bit_gap_start {
653 regions.push((GuestAddress(mem_start), size, Default::default()));
654 if let Some(bios_size) = bios_size {
655 regions.push((bios_start(bios_size), bios_size, Default::default()));
656 }
657 } else {
658 regions.push((
659 GuestAddress(mem_start),
660 end_32bit_gap_start.offset() - mem_start,
661 Default::default(),
662 ));
663 if let Some(bios_size) = bios_size {
664 regions.push((bios_start(bios_size), bios_size, Default::default()));
665 }
666 regions.push((
667 first_addr_past_32bits,
668 mem_end.offset_from(end_32bit_gap_start),
669 Default::default(),
670 ));
671 }
672
673 regions
674 }
675
676 impl arch::LinuxArch for X8664arch {
677 type Error = Error;
678
guest_memory_layout( components: &VmComponents, _hypervisor: &impl Hypervisor, ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>679 fn guest_memory_layout(
680 components: &VmComponents,
681 _hypervisor: &impl Hypervisor,
682 ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error> {
683 init_low_memory_layout(components.pcie_ecam, components.pci_low_start);
684
685 let bios_size = match &components.vm_image {
686 VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
687 VmImage::Kernel(_) => None,
688 };
689
690 Ok(arch_memory_regions(components.memory_size, bios_size))
691 }
692
get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig693 fn get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig {
694 SystemAllocatorConfig {
695 io: Some(AddressRange {
696 start: 0xc000,
697 end: 0xffff,
698 }),
699 low_mmio: read_pci_mmio_before_32bit(),
700 high_mmio: Self::get_high_mmio_range(vm),
701 platform_mmio: None,
702 first_irq: X86_64_IRQ_BASE,
703 }
704 }
705
build_vm<V, Vcpu>( mut components: VmComponents, vm_evt_wrtube: &SendTube, system_allocator: &mut SystemAllocator, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, battery: (Option<BatteryType>, Option<Minijail>), mut vm: V, ramoops_region: Option<arch::pstore::RamoopsRegion>, devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, irq_chip: &mut dyn IrqChipX86_64, vcpu_ids: &mut Vec<usize>, dump_device_tree_blob: Option<PathBuf>, debugcon_jail: Option<Minijail>, pflash_jail: Option<Minijail>, fw_cfg_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, #[cfg(any(target_os = "android", target_os = "linux"))] guest_suspended_cvar: Option< Arc<(Mutex<bool>, Condvar)>, >, device_tree_overlays: Vec<DtbOverlay>, ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error> where V: VmX86_64, Vcpu: VcpuX86_64,706 fn build_vm<V, Vcpu>(
707 mut components: VmComponents,
708 vm_evt_wrtube: &SendTube,
709 system_allocator: &mut SystemAllocator,
710 serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
711 serial_jail: Option<Minijail>,
712 battery: (Option<BatteryType>, Option<Minijail>),
713 mut vm: V,
714 ramoops_region: Option<arch::pstore::RamoopsRegion>,
715 devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
716 irq_chip: &mut dyn IrqChipX86_64,
717 vcpu_ids: &mut Vec<usize>,
718 dump_device_tree_blob: Option<PathBuf>,
719 debugcon_jail: Option<Minijail>,
720 pflash_jail: Option<Minijail>,
721 fw_cfg_jail: Option<Minijail>,
722 #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
723 #[cfg(any(target_os = "android", target_os = "linux"))] guest_suspended_cvar: Option<
724 Arc<(Mutex<bool>, Condvar)>,
725 >,
726 device_tree_overlays: Vec<DtbOverlay>,
727 ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
728 where
729 V: VmX86_64,
730 Vcpu: VcpuX86_64,
731 {
732 if components.hv_cfg.protection_type != ProtectionType::Unprotected {
733 return Err(Error::UnsupportedProtectionType);
734 }
735
736 let mem = vm.get_memory().clone();
737
738 let vcpu_count = components.vcpu_count;
739
740 vm.set_identity_map_addr(identity_map_addr_start())
741 .map_err(Error::SetIdentityMapAddr)?;
742
743 vm.set_tss_addr(tss_addr_start())
744 .map_err(Error::SetTssAddr)?;
745
746 // Use IRQ info in ACPI if provided by the user.
747 let mut mptable = true;
748 let mut sci_irq = X86_64_SCI_IRQ;
749
750 // punch pcie config mmio from pci low mmio, so that it couldn't be
751 // allocated to any device.
752 let pcie_cfg_mmio_range = read_pcie_cfg_mmio();
753 system_allocator
754 .reserve_mmio(pcie_cfg_mmio_range)
755 .map_err(Error::ReservePcieCfgMmio)?;
756
757 for sdt in components.acpi_sdts.iter() {
758 if sdt.is_signature(b"FACP") {
759 mptable = false;
760 let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
761 sci_irq = sci_irq_fadt.into();
762 if !system_allocator.reserve_irq(sci_irq) {
763 warn!("sci irq {} already reserved.", sci_irq);
764 }
765 }
766 }
767
768 let pcie_vcfg_range = Self::get_pcie_vcfg_mmio_range(&mem, &pcie_cfg_mmio_range);
769 let mmio_bus = Arc::new(Bus::new(BusType::Mmio));
770 let io_bus = Arc::new(Bus::new(BusType::Io));
771
772 let (pci_devices, devs): (Vec<_>, Vec<_>) = devs
773 .into_iter()
774 .partition(|(dev, _)| dev.as_pci_device().is_some());
775
776 let pci_devices = pci_devices
777 .into_iter()
778 .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
779 .collect();
780
781 let (pci, pci_irqs, mut pid_debug_label_map, amls, gpe_scope_amls) =
782 arch::generate_pci_root(
783 pci_devices,
784 irq_chip.as_irq_chip_mut(),
785 mmio_bus.clone(),
786 GuestAddress(pcie_cfg_mmio_range.start),
787 12,
788 io_bus.clone(),
789 system_allocator,
790 &mut vm,
791 4, // Share the four pin interrupts (INTx#)
792 Some(pcie_vcfg_range.start),
793 #[cfg(feature = "swap")]
794 swap_controller,
795 )
796 .map_err(Error::CreatePciRoot)?;
797
798 let pci = Arc::new(Mutex::new(pci));
799 pci.lock().enable_pcie_cfg_mmio(pcie_cfg_mmio_range.start);
800 let pci_cfg = PciConfigIo::new(
801 pci.clone(),
802 components.break_linux_pci_config_io,
803 vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
804 );
805 let pci_bus = Arc::new(Mutex::new(pci_cfg));
806 io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();
807
808 let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
809 let pcie_cfg_mmio_len = pcie_cfg_mmio_range.len().unwrap();
810 mmio_bus
811 .insert(pcie_cfg_mmio, pcie_cfg_mmio_range.start, pcie_cfg_mmio_len)
812 .unwrap();
813
814 let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 13)));
815 mmio_bus
816 .insert(
817 pcie_vcfg_mmio,
818 pcie_vcfg_range.start,
819 pcie_vcfg_range.len().unwrap(),
820 )
821 .unwrap();
822
823 let (virtio_mmio_devices, _others): (Vec<_>, Vec<_>) = devs
824 .into_iter()
825 .partition(|(dev, _)| dev.as_virtio_mmio_device().is_some());
826
827 let virtio_mmio_devices = virtio_mmio_devices
828 .into_iter()
829 .map(|(dev, jail_orig)| (*(dev.into_virtio_mmio_device().unwrap()), jail_orig))
830 .collect();
831 let (mut virtio_mmio_pid, sdts) = arch::generate_virtio_mmio_bus(
832 virtio_mmio_devices,
833 irq_chip.as_irq_chip_mut(),
834 &mmio_bus,
835 system_allocator,
836 &mut vm,
837 components.acpi_sdts,
838 #[cfg(feature = "swap")]
839 swap_controller,
840 )
841 .map_err(Error::CreateVirtioMmioBus)?;
842 components.acpi_sdts = sdts;
843 pid_debug_label_map.append(&mut virtio_mmio_pid);
844
845 // Event used to notify crosvm that guest OS is trying to suspend.
846 let suspend_evt = Event::new().map_err(Error::CreateEvent)?;
847
848 if components.fw_cfg_enable {
849 Self::setup_fw_cfg_device(
850 &io_bus,
851 components.fw_cfg_parameters.clone(),
852 components.bootorder_fw_cfg_blob.clone(),
853 fw_cfg_jail,
854 #[cfg(feature = "swap")]
855 swap_controller,
856 )?;
857 }
858
859 if !components.no_i8042 {
860 Self::setup_legacy_i8042_device(
861 &io_bus,
862 irq_chip.pit_uses_speaker_port(),
863 vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
864 )?;
865 }
866 let vm_request_tube = if !components.no_rtc {
867 let (host_tube, device_tube) = Tube::pair()
868 .context("create tube")
869 .map_err(Error::SetupCmos)?;
870 Self::setup_legacy_cmos_device(&io_bus, irq_chip, device_tube, components.memory_size)
871 .map_err(Error::SetupCmos)?;
872 Some(host_tube)
873 } else {
874 None
875 };
876 let serial_devices = Self::setup_serial_devices(
877 components.hv_cfg.protection_type,
878 irq_chip.as_irq_chip_mut(),
879 &io_bus,
880 serial_parameters,
881 serial_jail,
882 #[cfg(feature = "swap")]
883 swap_controller,
884 )?;
885 Self::setup_debugcon_devices(
886 components.hv_cfg.protection_type,
887 &io_bus,
888 serial_parameters,
889 debugcon_jail,
890 #[cfg(feature = "swap")]
891 swap_controller,
892 )?;
893
894 let bios_size = if let VmImage::Bios(ref bios) = components.vm_image {
895 bios.metadata().map_err(Error::LoadBios)?.len()
896 } else {
897 0
898 };
899 if let Some(pflash_image) = components.pflash_image {
900 Self::setup_pflash(
901 pflash_image,
902 components.pflash_block_size,
903 bios_size,
904 &mmio_bus,
905 pflash_jail,
906 #[cfg(feature = "swap")]
907 swap_controller,
908 )?;
909 }
910
911 // Functions that use/create jails MUST be used before the call to
912 // setup_acpi_devices below, as this move us into a multiprocessing state
913 // from which we can no longer fork.
914
915 let mut resume_notify_devices = Vec::new();
916
917 // each bus occupy 1MB mmio for pcie enhanced configuration
918 let max_bus = (pcie_cfg_mmio_len / 0x100000 - 1) as u8;
919 let (mut acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
920 pci.clone(),
921 &mem,
922 &io_bus,
923 system_allocator,
924 suspend_evt.try_clone().map_err(Error::CloneEvent)?,
925 vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
926 components.acpi_sdts,
927 irq_chip.as_irq_chip_mut(),
928 sci_irq,
929 battery,
930 &mmio_bus,
931 max_bus,
932 &mut resume_notify_devices,
933 #[cfg(feature = "swap")]
934 swap_controller,
935 #[cfg(any(target_os = "android", target_os = "linux"))]
936 components.ac_adapter,
937 #[cfg(any(target_os = "android", target_os = "linux"))]
938 guest_suspended_cvar,
939 &pci_irqs,
940 )?;
941
942 // Create customized SSDT table
943 let sdt = acpi::create_customize_ssdt(pci.clone(), amls, gpe_scope_amls);
944 if let Some(sdt) = sdt {
945 acpi_dev_resource.sdts.push(sdt);
946 }
947
948 irq_chip
949 .finalize_devices(system_allocator, &io_bus, &mmio_bus)
950 .map_err(Error::RegisterIrqfd)?;
951
952 // All of these bios generated tables are set manually for the benefit of the kernel boot
953 // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't
954 // have a way to pass the BIOS these configs.
955 // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
956 // tables and the guest OS picks them up.
957 // If another guest does need a way to pass these tables down to it's BIOS, this approach
958 // should be rethought.
959
960 if mptable {
961 // Note that this puts the mptable at 0x9FC00 in guest physical memory.
962 mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
963 .map_err(Error::SetupMptable)?;
964 }
965 smbios::setup_smbios(&mem, &components.smbios, bios_size).map_err(Error::SetupSmbios)?;
966
967 let host_cpus = if components.host_cpu_topology {
968 components.vcpu_affinity.clone()
969 } else {
970 None
971 };
972
973 // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
974 acpi::create_acpi_tables(
975 &mem,
976 vcpu_count as u8,
977 sci_irq,
978 0xcf9,
979 6, // RST_CPU|SYS_RST
980 &acpi_dev_resource,
981 host_cpus,
982 vcpu_ids,
983 &pci_irqs,
984 pcie_cfg_mmio_range.start,
985 max_bus,
986 components.force_s2idle,
987 )
988 .ok_or(Error::CreateAcpi)?;
989
990 let mut cmdline = Self::get_base_linux_cmdline();
991
992 get_serial_cmdline(&mut cmdline, serial_parameters, "io", &serial_devices)
993 .map_err(Error::GetSerialCmdline)?;
994
995 for param in components.extra_kernel_params {
996 cmdline.insert_str(¶m).map_err(Error::Cmdline)?;
997 }
998
999 if let Some(ramoops_region) = ramoops_region {
1000 arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
1001 .map_err(Error::Cmdline)?;
1002 }
1003
1004 let pci_start = read_pci_mmio_before_32bit().start;
1005
1006 let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count];
1007 let mut msrs = BTreeMap::new();
1008
1009 match components.vm_image {
1010 VmImage::Bios(ref mut bios) => {
1011 // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
1012 kernel_loader::load_cmdline(
1013 &mem,
1014 GuestAddress(CMDLINE_OFFSET),
1015 &CString::new(cmdline).unwrap(),
1016 )
1017 .map_err(Error::LoadCmdline)?;
1018 Self::load_bios(&mem, bios)?;
1019 regs::set_default_msrs(&mut msrs);
1020 // The default values for `Regs` and `Sregs` already set up the reset vector.
1021 }
1022 VmImage::Kernel(ref mut kernel_image) => {
1023 let (params, kernel_end, kernel_entry) = Self::load_kernel(&mem, kernel_image)?;
1024
1025 Self::setup_system_memory(
1026 &mem,
1027 &CString::new(cmdline).unwrap(),
1028 components.initrd_image,
1029 components.android_fstab,
1030 kernel_end,
1031 params,
1032 dump_device_tree_blob,
1033 device_tree_overlays,
1034 )?;
1035
1036 // Configure the bootstrap VCPU for the Linux/x86 64-bit boot protocol.
1037 // <https://www.kernel.org/doc/html/latest/x86/boot.html>
1038 vcpu_init[0].regs.rip = kernel_entry.offset();
1039 vcpu_init[0].regs.rsp = BOOT_STACK_POINTER;
1040 vcpu_init[0].regs.rsi = ZERO_PAGE_OFFSET;
1041
1042 regs::set_long_mode_msrs(&mut msrs);
1043 regs::set_mtrr_msrs(&mut msrs, &vm, pci_start);
1044
1045 // Set up long mode and enable paging.
1046 regs::configure_segments_and_sregs(&mem, &mut vcpu_init[0].sregs)
1047 .map_err(Error::ConfigureSegments)?;
1048 regs::setup_page_tables(&mem, &mut vcpu_init[0].sregs)
1049 .map_err(Error::SetupPageTables)?;
1050 }
1051 }
1052
1053 // Initialize MSRs for all VCPUs.
1054 for vcpu in vcpu_init.iter_mut() {
1055 vcpu.msrs = msrs.clone();
1056 }
1057
1058 Ok(RunnableLinuxVm {
1059 vm,
1060 vcpu_count,
1061 vcpus: None,
1062 vcpu_affinity: components.vcpu_affinity,
1063 vcpu_init,
1064 no_smt: components.no_smt,
1065 irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?,
1066 io_bus,
1067 mmio_bus,
1068 pid_debug_label_map,
1069 suspend_evt,
1070 resume_notify_devices,
1071 rt_cpus: components.rt_cpus,
1072 delay_rt: components.delay_rt,
1073 bat_control,
1074 #[cfg(feature = "gdb")]
1075 gdb: components.gdb,
1076 pm: Some(acpi_dev_resource.pm),
1077 root_config: pci,
1078 #[cfg(any(target_os = "android", target_os = "linux"))]
1079 platform_devices: Vec::new(),
1080 hotplug_bus: BTreeMap::new(),
1081 devices_thread: None,
1082 vm_request_tube,
1083 })
1084 }
1085
configure_vcpu<V: Vm>( vm: &V, hypervisor: &dyn HypervisorX86_64, irq_chip: &mut dyn IrqChipX86_64, vcpu: &mut dyn VcpuX86_64, vcpu_init: VcpuInitX86_64, vcpu_id: usize, num_cpus: usize, cpu_config: Option<CpuConfigX86_64>, ) -> Result<()>1086 fn configure_vcpu<V: Vm>(
1087 vm: &V,
1088 hypervisor: &dyn HypervisorX86_64,
1089 irq_chip: &mut dyn IrqChipX86_64,
1090 vcpu: &mut dyn VcpuX86_64,
1091 vcpu_init: VcpuInitX86_64,
1092 vcpu_id: usize,
1093 num_cpus: usize,
1094 cpu_config: Option<CpuConfigX86_64>,
1095 ) -> Result<()> {
1096 let cpu_config = match cpu_config {
1097 Some(config) => config,
1098 None => return Err(Error::InvalidCpuConfig),
1099 };
1100 if !vm.check_capability(VmCap::EarlyInitCpuid) {
1101 cpuid::setup_cpuid(hypervisor, irq_chip, vcpu, vcpu_id, num_cpus, cpu_config)
1102 .map_err(Error::SetupCpuid)?;
1103 }
1104
1105 vcpu.set_regs(&vcpu_init.regs).map_err(Error::WriteRegs)?;
1106
1107 vcpu.set_sregs(&vcpu_init.sregs)
1108 .map_err(Error::SetupSregs)?;
1109
1110 vcpu.set_fpu(&vcpu_init.fpu).map_err(Error::SetupFpu)?;
1111
1112 let vcpu_supported_var_mtrrs = regs::vcpu_supported_variable_mtrrs(vcpu);
1113 let num_var_mtrrs = regs::count_variable_mtrrs(&vcpu_init.msrs);
1114 let skip_mtrr_msrs = if num_var_mtrrs > vcpu_supported_var_mtrrs {
1115 warn!(
1116 "Too many variable MTRR entries ({} required, {} supported),
1117 please check pci_start addr, guest with pass through device may be very slow",
1118 num_var_mtrrs, vcpu_supported_var_mtrrs,
1119 );
1120 // Filter out the MTRR entries from the MSR list.
1121 true
1122 } else {
1123 false
1124 };
1125
1126 for (msr_index, value) in vcpu_init.msrs.into_iter() {
1127 if skip_mtrr_msrs && regs::is_mtrr_msr(msr_index) {
1128 continue;
1129 }
1130
1131 vcpu.set_msr(msr_index, value).map_err(Error::SetupMsrs)?;
1132 }
1133
1134 interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
1135
1136 Ok(())
1137 }
1138
    /// Registers a PCI device with the running VM, delegating to
    /// `arch::configure_pci_device` and mapping its error into this module's `Error`.
    ///
    /// # Arguments
    ///
    /// * `linux` - The VM the device is added to.
    /// * `device` - The PCI device to register.
    /// * `minijail` - Optional sandbox for the device process (unix-like hosts only).
    /// * `resources` - System resource allocator passed through to device setup.
    /// * `hp_control_tube` - Channel for sending commands to the PCI root.
    ///
    /// Returns the PCI address assigned to the device.
    fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>(
        linux: &mut RunnableLinuxVm<V, Vcpu>,
        device: Box<dyn PciDevice>,
        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
        resources: &mut SystemAllocator,
        hp_control_tube: &mpsc::Sender<PciRootCommand>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<PciAddress> {
        arch::configure_pci_device(
            linux,
            device,
            #[cfg(any(target_os = "android", target_os = "linux"))]
            minijail,
            resources,
            hp_control_tube,
            #[cfg(feature = "swap")]
            swap_controller,
        )
        .map_err(Error::ConfigurePciDevice)
    }
1159
get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>>1160 fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>> {
1161 Ok(BTreeMap::new())
1162 }
1163
get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>>1164 fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>> {
1165 Ok(BTreeMap::new())
1166 }
1167
get_host_cpu_clusters() -> Result<Vec<CpuSet>>1168 fn get_host_cpu_clusters() -> Result<Vec<CpuSet>> {
1169 Ok(Vec::new())
1170 }
1171 }
1172
#[cfg(feature = "gdb")]
// GDB remote-debugging support: translates between the hypervisor's vcpu state and
// the register layout the GDB remote protocol expects for x86_64.
impl<T: VcpuX86_64> arch::GdbOps<T> for X8664arch {
    type Error = Error;

    // Reads the full register file GDB needs: general-purpose, segment, x87 FPU,
    // and SSE registers.
    fn read_registers(vcpu: &T) -> Result<X86_64CoreRegs> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
        let gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
        // The array order below must match GDB's 64bit-core register numbering.
        let regs = [
            gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
            gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // x87 FPU internal state
        // TODO(dverkamp): floating point tag word, instruction pointer, and data pointer
        let fpu = vcpu.get_fpu().map_err(Error::ReadRegs)?;
        let fpu_internal = X87FpuInternalRegs {
            fctrl: u32::from(fpu.fcw),
            fstat: u32::from(fpu.fsw),
            fop: u32::from(fpu.last_opcode),
            ..Default::default()
        };

        let mut regs = X86_64CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            st: Default::default(),
            fpu: fpu_internal,
            xmm: Default::default(),
            mxcsr: fpu.mxcsr,
        };

        // x87 FPU registers: ST0-ST7
        for (dst, src) in regs.st.iter_mut().zip(fpu.fpr.iter()) {
            // `fpr` contains the x87 floating point registers in FXSAVE format.
            // Each element contains an 80-bit floating point value in the low 10 bytes.
            // The upper 6 bytes are reserved and can be ignored.
            dst.copy_from_slice(&src[0..10])
        }

        // SSE registers: XMM0-XMM15
        for (dst, src) in regs.xmm.iter_mut().zip(fpu.xmm.iter()) {
            *dst = u128::from_le_bytes(*src);
        }

        Ok(regs)
    }

    // Writes GDB-supplied register state back to the vcpu. State GDB does not model
    // (e.g. the upper 32 bits of rflags, segment descriptor caches) is read from the
    // vcpu first and preserved.
    fn write_registers(vcpu: &T, regs: &X86_64CoreRegs) -> Result<()> {
        // General purpose registers (RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15) + RIP + rflags
        let orig_gregs = vcpu.get_regs().map_err(Error::ReadRegs)?;
        let gregs = Regs {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update the lower 32 bits of rflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };
        vcpu.set_regs(&gregs).map_err(Error::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB care only selectors, we call get_sregs() first.
        let mut sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        vcpu.set_sregs(&sregs).map_err(Error::WriteRegs)?;

        // FPU and SSE registers
        let mut fpu = vcpu.get_fpu().map_err(Error::ReadRegs)?;
        fpu.fcw = regs.fpu.fctrl as u16;
        fpu.fsw = regs.fpu.fstat as u16;
        fpu.last_opcode = regs.fpu.fop as u16;
        // TODO(dverkamp): floating point tag word, instruction pointer, and data pointer

        // x87 FPU registers: ST0-ST7
        for (dst, src) in fpu.fpr.iter_mut().zip(regs.st.iter()) {
            // Only the low 10 bytes (80-bit value) are meaningful; upper bytes untouched.
            dst[0..10].copy_from_slice(src);
        }

        // SSE registers: XMM0-XMM15
        for (dst, src) in fpu.xmm.iter_mut().zip(regs.xmm.iter()) {
            dst.copy_from_slice(&src.to_le_bytes());
        }

        vcpu.set_fpu(&fpu).map_err(Error::WriteRegs)?;

        Ok(())
    }

    // Single-register access is not implemented; GDB falls back to full reads/writes.
    #[inline]
    fn read_register(_vcpu: &T, _reg: X86_64CoreRegId) -> Result<Vec<u8>> {
        Err(Error::ReadRegIsUnsupported)
    }

    #[inline]
    fn write_register(_vcpu: &T, _reg: X86_64CoreRegId, _buf: &[u8]) -> Result<()> {
        Err(Error::WriteRegIsUnsupported)
    }

    // Reads `len` bytes of guest memory starting at virtual address `vaddr`, walking
    // the guest page tables (via `phys_addr`) for each page touched.
    fn read_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        len: usize,
    ) -> Result<Vec<u8>> {
        let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        let mut buf = vec![0; len];
        let mut total_read = 0u64;
        // Handle reads across page boundaries.

        while total_read < len as u64 {
            // Translate the current virtual address and clamp the read to the rest of
            // that physical page.
            let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_read, &sregs)?;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            guest_mem
                .get_slice_at_addr(GuestAddress(paddr), read_len as usize)
                .map_err(Error::ReadingGuestMemory)?
                .copy_to(&mut buf[total_read as usize..]);
            total_read += read_len;
        }
        Ok(buf)
    }

    // Writes `buf` into guest memory at virtual address `vaddr`, translating each
    // page through the guest page tables.
    fn write_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        buf: &[u8],
    ) -> Result<()> {
        let sregs = vcpu.get_sregs().map_err(Error::ReadRegs)?;
        let mut total_written = 0u64;
        // Handle writes across page boundaries.
        while total_written < buf.len() as u64 {
            let (paddr, psize) = phys_addr(guest_mem, vaddr.0 + total_written, &sregs)?;
            // Clamp the write to the remainder of the current physical page.
            let write_len = std::cmp::min(
                buf.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );

            guest_mem
                .write_all_at_addr(
                    &buf[total_written as usize..(total_written as usize + write_len as usize)],
                    GuestAddress(paddr),
                )
                .map_err(Error::WritingGuestMemory)?;
            total_written += write_len;
        }
        Ok(())
    }

    // Enables single-step debugging, clearing any hardware breakpoints.
    fn enable_singlestep(vcpu: &T) -> Result<()> {
        vcpu.set_guest_debug(&[], true /* enable_singlestep */)
            .map_err(Error::EnableSinglestep)
    }

    // x86_64 debug registers DR0-DR3 provide four hardware breakpoint slots.
    fn get_max_hw_breakpoints(_vcpu: &T) -> Result<usize> {
        Ok(4usize)
    }

    fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<()> {
        vcpu.set_guest_debug(breakpoints, false /* enable_singlestep */)
            .map_err(Error::SetHwBreakpoint)
    }
}
1372
#[cfg(feature = "gdb")]
// Walks the guest page tables to translate virtual address `vaddr`.
// return the translated address and the size of the page it resides in.
fn phys_addr(mem: &GuestMemory, vaddr: u64, sregs: &Sregs) -> Result<(u64, u64)> {
    const CR0_PG_MASK: u64 = 1 << 31;
    const CR4_PAE_MASK: u64 = 1 << 5;
    const CR4_LA57_MASK: u64 = 1 << 12;
    const MSR_EFER_LMA: u64 = 1 << 10;
    // bits 12 through 51 are the address in a PTE.
    const PTE_ADDR_MASK: u64 = ((1 << 52) - 1) & !0x0fff;
    const PAGE_PRESENT: u64 = 0x1;
    const PAGE_PSE_MASK: u64 = 0x1 << 7;

    const PAGE_SIZE_4K: u64 = 4 * 1024;
    const PAGE_SIZE_2M: u64 = 2 * 1024 * 1024;
    const PAGE_SIZE_1G: u64 = 1024 * 1024 * 1024;

    // Reads the page-table entry for `vaddr` at paging `level` from the table whose
    // base is encoded in `curr_table_addr`, failing if the entry is not present.
    fn next_pte(mem: &GuestMemory, curr_table_addr: u64, vaddr: u64, level: usize) -> Result<u64> {
        let ent: u64 = mem
            .read_obj_from_addr(GuestAddress(
                (curr_table_addr & PTE_ADDR_MASK) + page_table_offset(vaddr, level),
            ))
            .map_err(|_| Error::TranslatingVirtAddr)?;
        /* TODO - convert to a trace
        println!(
            "level {} vaddr {:x} table-addr {:x} mask {:x} ent {:x} offset {:x}",
            level,
            vaddr,
            curr_table_addr,
            PTE_ADDR_MASK,
            ent,
            page_table_offset(vaddr, level)
        );
        */
        if ent & PAGE_PRESENT == 0 {
            return Err(Error::PageNotPresent);
        }
        Ok(ent)
    }

    // Get the offset in to the page of `vaddr`.
    fn page_offset(vaddr: u64, page_size: u64) -> u64 {
        vaddr & (page_size - 1)
    }

    // Get the offset in to the page table of the given `level` specified by the virtual `address`.
    // `level` is 1 through 5 in x86_64 to handle the five levels of paging.
    fn page_table_offset(addr: u64, level: usize) -> u64 {
        let offset = (level - 1) * 9 + 12;
        ((addr >> offset) & 0x1ff) << 3
    }

    // Paging disabled: the virtual address is the physical address.
    if sregs.cr0 & CR0_PG_MASK == 0 {
        return Ok((vaddr, PAGE_SIZE_4K));
    }

    // Only PAE-based paging is handled; legacy 32-bit non-PAE paging is unsupported.
    if sregs.cr4 & CR4_PAE_MASK == 0 {
        return Err(Error::TranslatingVirtAddr);
    }

    if sregs.efer & MSR_EFER_LMA != 0 {
        // 5-level paging (LA57) is not implemented yet.
        if sregs.cr4 & CR4_LA57_MASK != 0 {
            todo!("handle LA57");
        }
        let p4_ent = next_pte(mem, sregs.cr3, vaddr, 4)?;
        let p3_ent = next_pte(mem, p4_ent, vaddr, 3)?;
        // Check for a 1G page: the PSE bit set in p3_ent.
        if p3_ent & PAGE_PSE_MASK != 0 {
            // It's a 1G page with the PSE bit in p3_ent
            let paddr = p3_ent & PTE_ADDR_MASK | page_offset(vaddr, PAGE_SIZE_1G);
            return Ok((paddr, PAGE_SIZE_1G));
        }
        let p2_ent = next_pte(mem, p3_ent, vaddr, 2)?;
        if p2_ent & PAGE_PSE_MASK != 0 {
            // It's a 2M page with the PSE bit in p2_ent
            let paddr = p2_ent & PTE_ADDR_MASK | page_offset(vaddr, PAGE_SIZE_2M);
            return Ok((paddr, PAGE_SIZE_2M));
        }
        let p1_ent = next_pte(mem, p2_ent, vaddr, 1)?;
        let paddr = p1_ent & PTE_ADDR_MASK | page_offset(vaddr, PAGE_SIZE_4K);
        return Ok((paddr, PAGE_SIZE_4K));
    }
    // Non-long-mode PAE translation is not implemented.
    Err(Error::TranslatingVirtAddr)
}
1457
// OSC returned status register in CDW1
const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4;
// pci host bridge OSC returned control register in CDW3
// Unused bit constants are kept to document the full control-register layout.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1;
const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4;
const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10;

// Marker type whose `Aml` impl emits the PCI root bridge _OSC method.
struct PciRootOSC {}
1471
1472 // Method (_OSC, 4, NotSerialized) // _OSC: Operating System Capabilities
1473 // {
1474 // CreateDWordField (Arg3, Zero, CDW1) // flag and return value
1475 // If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
1476 // {
1477 // CreateDWordField (Arg3, 8, CDW3) // control field
1478 // if ( 0 == (CDW1 & 0x01)) // Query flag ?
1479 // {
1480 // CDW3 &= !(SHPC_HP | AER)
1481 // }
1482 // } Else {
1483 // CDW1 |= UNSUPPORT_UUID
1484 // }
1485 // Return (Arg3)
1486 // }
impl Aml for PciRootOSC {
    // Emits the AML bytecode for the _OSC method described in the comment above.
    fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
        // UUID identifying the PCI Host Bridge _OSC interface.
        let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
        // virtual pcie root port supports hotplug, pme, and pcie cap register, clear all
        // the other bits.
        let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP | PCI_HB_OSC_CONTROL_PCIE_AER);
        aml::Method::new(
            "_OSC".into(),
            4,
            false,
            vec![
                // CDW1 (status/flags) is at byte offset 0 of the Arg3 buffer.
                &aml::CreateDWordField::new(
                    &aml::Name::new_field_name("CDW1"),
                    &aml::Arg(3),
                    &aml::ZERO,
                ),
                &aml::If::new(
                    &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
                    vec![
                        // CDW3 (control) is at byte offset 8 of the Arg3 buffer.
                        &aml::CreateDWordField::new(
                            &aml::Name::new_field_name("CDW3"),
                            &aml::Arg(3),
                            &(8_u8),
                        ),
                        // If the query flag (CDW1 bit 0) is clear, mask off the control
                        // bits crosvm does not grant (SHPC hotplug and AER).
                        &aml::If::new(
                            &aml::Equal::new(
                                &aml::ZERO,
                                &aml::And::new(
                                    &aml::ZERO,
                                    &aml::Name::new_field_name("CDW1"),
                                    &aml::ONE,
                                ),
                            ),
                            vec![&aml::And::new(
                                &aml::Name::new_field_name("CDW3"),
                                &mask,
                                &aml::Name::new_field_name("CDW3"),
                            )],
                        ),
                    ],
                ),
                // Unknown UUID: set the "unrecognized UUID" bit in the status dword.
                &aml::Else::new(vec![&aml::Or::new(
                    &aml::Name::new_field_name("CDW1"),
                    &OSC_STATUS_UNSUPPORT_UUID,
                    &aml::Name::new_field_name("CDW1"),
                )]),
                &aml::Return::new(&aml::Arg(3)),
            ],
        )
        .to_aml_bytes(aml)
    }
}
1539
1540 impl X8664arch {
1541 /// Loads the bios from an open file.
1542 ///
1543 /// # Arguments
1544 ///
1545 /// * `mem` - The memory to be used by the guest.
1546 /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>1547 fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1548 let bios_image_length = bios_image.get_len().map_err(Error::LoadBios)?;
1549 if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1550 return Err(Error::LoadBios(io::Error::new(
1551 io::ErrorKind::InvalidData,
1552 format!(
1553 "bios was {} bytes, expected less than {}",
1554 bios_image_length, FIRST_ADDR_PAST_32BITS,
1555 ),
1556 )));
1557 }
1558
1559 let guest_slice = mem
1560 .get_slice_at_addr(bios_start(bios_image_length), bios_image_length as usize)
1561 .map_err(Error::SetupGuestMemory)?;
1562 bios_image
1563 .read_exact_at_volatile(guest_slice, 0)
1564 .map_err(Error::LoadBios)?;
1565 Ok(())
1566 }
1567
    /// Sets up a pflash (firmware flash) device backed by `pflash_image`, mapped on
    /// the MMIO bus directly below the BIOS region at the top of 32-bit address space.
    ///
    /// # Arguments
    ///
    /// * `pflash_image` - Backing file for the flash contents.
    /// * `block_size` - Flash block size in bytes, passed to the `Pflash` device.
    /// * `bios_size` - Size of the BIOS image; pflash is placed immediately below it.
    /// * `mmio_bus` - The MMIO bus to register the device on.
    /// * `jail` - Optional minijail; when present (unix-like hosts) the device runs in
    ///   a sandboxed child process via `ProxyDevice`.
    fn setup_pflash(
        pflash_image: File,
        block_size: u32,
        bios_size: u64,
        mmio_bus: &Bus,
        jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<()> {
        let size = pflash_image.metadata().map_err(Error::LoadPflash)?.len();
        // The BIOS ends at the 4 GiB boundary, so pflash sits just below it.
        let start = FIRST_ADDR_PAST_32BITS - bios_size - size;
        let pflash_image = Box::new(pflash_image);

        // Descriptors that must remain accessible to the jailed device process.
        #[cfg(any(target_os = "android", target_os = "linux"))]
        let fds = pflash_image.as_raw_descriptors();

        let pflash = Pflash::new(pflash_image, block_size).map_err(Error::SetupPflash)?;
        let pflash: Arc<Mutex<dyn BusDevice>> = match jail {
            #[cfg(any(target_os = "android", target_os = "linux"))]
            Some(jail) => Arc::new(Mutex::new(
                ProxyDevice::new(
                    pflash,
                    jail,
                    fds,
                    #[cfg(feature = "swap")]
                    swap_controller,
                )
                .map_err(Error::CreateProxyDevice)?,
            )),
            // Windows builds never construct a jail for this device.
            #[cfg(windows)]
            Some(_) => unreachable!(),
            None => Arc::new(Mutex::new(pflash)),
        };
        mmio_bus
            .insert(pflash, start, size)
            .map_err(Error::InsertBus)?;

        Ok(())
    }
1606
1607 /// Loads the kernel from an open file.
1608 ///
1609 /// # Arguments
1610 ///
1611 /// * `mem` - The memory to be used by the guest.
1612 /// * `kernel_image` - the File object for the specified kernel.
1613 ///
1614 /// # Returns
1615 ///
1616 /// On success, returns the Linux x86_64 boot protocol parameters, the first address past the
1617 /// end of the kernel, and the entry point (initial `RIP` value).
load_kernel( mem: &GuestMemory, kernel_image: &mut File, ) -> Result<(boot_params, u64, GuestAddress)>1618 fn load_kernel(
1619 mem: &GuestMemory,
1620 kernel_image: &mut File,
1621 ) -> Result<(boot_params, u64, GuestAddress)> {
1622 let kernel_start = GuestAddress(KERNEL_START_OFFSET);
1623 match kernel_loader::load_elf64(mem, kernel_start, kernel_image, 0) {
1624 Ok(loaded_kernel) => {
1625 // ELF kernels don't contain a `boot_params` structure, so synthesize a default one.
1626 let boot_params = Default::default();
1627 Ok((
1628 boot_params,
1629 loaded_kernel.address_range.end,
1630 loaded_kernel.entry,
1631 ))
1632 }
1633 Err(kernel_loader::Error::InvalidMagicNumber) => {
1634 // The image failed to parse as ELF, so try to load it as a bzImage.
1635 let (boot_params, bzimage_end) =
1636 bzimage::load_bzimage(mem, kernel_start, kernel_image)
1637 .map_err(Error::LoadBzImage)?;
1638 let bzimage_entry = mem
1639 .checked_offset(kernel_start, KERNEL_64BIT_ENTRY_OFFSET)
1640 .ok_or(Error::KernelOffsetPastEnd)?;
1641 Ok((boot_params, bzimage_end, bzimage_entry))
1642 }
1643 Err(e) => Err(Error::LoadKernel(e)),
1644 }
1645 }
1646
1647 /// Configures the system memory space should be called once per vm before
1648 /// starting vcpu threads.
1649 ///
1650 /// # Arguments
1651 ///
1652 /// * `mem` - The memory to be used by the guest.
1653 /// * `cmdline` - the kernel commandline
1654 /// * `initrd_file` - an initial ramdisk image
setup_system_memory( mem: &GuestMemory, cmdline: &CStr, initrd_file: Option<File>, android_fstab: Option<File>, kernel_end: u64, params: boot_params, dump_device_tree_blob: Option<PathBuf>, device_tree_overlays: Vec<DtbOverlay>, ) -> Result<()>1655 pub fn setup_system_memory(
1656 mem: &GuestMemory,
1657 cmdline: &CStr,
1658 initrd_file: Option<File>,
1659 android_fstab: Option<File>,
1660 kernel_end: u64,
1661 params: boot_params,
1662 dump_device_tree_blob: Option<PathBuf>,
1663 device_tree_overlays: Vec<DtbOverlay>,
1664 ) -> Result<()> {
1665 kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
1666 .map_err(Error::LoadCmdline)?;
1667
1668 let mut setup_data = Vec::<SetupData>::new();
1669 if let Some(android_fstab) = android_fstab {
1670 setup_data.push(
1671 fdt::create_fdt(android_fstab, dump_device_tree_blob, device_tree_overlays)
1672 .map_err(Error::CreateFdt)?,
1673 );
1674 }
1675 setup_data.push(setup_data_rng_seed());
1676
1677 let setup_data = write_setup_data(
1678 mem,
1679 GuestAddress(SETUP_DATA_START),
1680 GuestAddress(SETUP_DATA_END),
1681 &setup_data,
1682 )?;
1683
1684 let initrd = match initrd_file {
1685 Some(mut initrd_file) => {
1686 let mut initrd_addr_max = u64::from(params.hdr.initrd_addr_max);
1687 // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
1688 if initrd_addr_max == 0 {
1689 initrd_addr_max = 0x37FFFFFF;
1690 }
1691
1692 let mem_max = mem.end_addr().offset() - 1;
1693 if initrd_addr_max > mem_max {
1694 initrd_addr_max = mem_max;
1695 }
1696
1697 let (initrd_start, initrd_size) = arch::load_image_high(
1698 mem,
1699 &mut initrd_file,
1700 GuestAddress(kernel_end),
1701 GuestAddress(initrd_addr_max),
1702 base::pagesize() as u64,
1703 )
1704 .map_err(Error::LoadInitrd)?;
1705 Some((initrd_start, initrd_size))
1706 }
1707 None => None,
1708 };
1709
1710 configure_system(
1711 mem,
1712 GuestAddress(KERNEL_START_OFFSET),
1713 GuestAddress(CMDLINE_OFFSET),
1714 cmdline.to_bytes().len() + 1,
1715 setup_data,
1716 initrd,
1717 params,
1718 )?;
1719 Ok(())
1720 }
1721
get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange1722 fn get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange {
1723 // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is
1724 // greater.
1725 let ram_end_round_2mb = (mem.end_addr().offset() + 2 * MB - 1) / (2 * MB) * (2 * MB);
1726 let start = std::cmp::max(ram_end_round_2mb, 4 * GB);
1727 // Each pci device's ECAM size is 4kb and its vcfg size is 8kb
1728 let end = start + pcie_cfg_mmio.len().unwrap() * 2 - 1;
1729 AddressRange { start, end }
1730 }
1731
1732 /// Returns the high mmio range
get_high_mmio_range<V: Vm>(vm: &V) -> AddressRange1733 fn get_high_mmio_range<V: Vm>(vm: &V) -> AddressRange {
1734 let mem = vm.get_memory();
1735 let start = Self::get_pcie_vcfg_mmio_range(mem, &read_pcie_cfg_mmio()).end + 1;
1736
1737 let phys_mem_end = (1u64 << vm.get_guest_phys_addr_bits()) - 1;
1738 let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
1739
1740 AddressRange {
1741 start,
1742 end: high_mmio_end,
1743 }
1744 }
1745
1746 /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline() -> kernel_cmdline::Cmdline1747 pub fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
1748 let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
1749 cmdline.insert_str("panic=-1").unwrap();
1750
1751 cmdline
1752 }
1753
1754 /// Sets up fw_cfg device.
1755 /// # Arguments
1756 ///
1757 /// * - `io_bus` - the IO bus object
1758 /// * - `fw_cfg_parameters` - command-line specified data to add to device. May contain
1759 /// all None fields if user did not specify data to add to the device
setup_fw_cfg_device( io_bus: &Bus, fw_cfg_parameters: Vec<FwCfgParameters>, bootorder_fw_cfg_blob: Vec<u8>, fw_cfg_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>1760 fn setup_fw_cfg_device(
1761 io_bus: &Bus,
1762 fw_cfg_parameters: Vec<FwCfgParameters>,
1763 bootorder_fw_cfg_blob: Vec<u8>,
1764 fw_cfg_jail: Option<Minijail>,
1765 #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1766 ) -> Result<()> {
1767 let fw_cfg = match devices::FwCfgDevice::new(FW_CFG_MAX_FILE_SLOTS, fw_cfg_parameters) {
1768 Ok(mut device) => {
1769 // this condition will only be true if the user specified at least one bootindex
1770 // option on the command line. If none were specified, bootorder_fw_cfg_blob will
1771 // only have a null byte (null terminator)
1772 if bootorder_fw_cfg_blob.len() > 1 {
1773 // Add boot order file to the device. If the file is not present, firmware may
1774 // not be able to boot.
1775 if let Err(err) = device.add_file(
1776 "bootorder",
1777 bootorder_fw_cfg_blob,
1778 devices::FwCfgItemType::GenericItem,
1779 ) {
1780 return Err(Error::CreateFwCfgDevice(err));
1781 }
1782 }
1783 device
1784 }
1785 Err(err) => {
1786 return Err(Error::CreateFwCfgDevice(err));
1787 }
1788 };
1789
1790 let fw_cfg: Arc<Mutex<dyn BusDevice>> = match fw_cfg_jail.as_ref() {
1791 #[cfg(any(target_os = "android", target_os = "linux"))]
1792 Some(jail) => {
1793 let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
1794 #[cfg(feature = "seccomp_trace")]
1795 debug!(
1796 "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
1797 read_jail_addr(jail),
1798 read_jail_addr(&jail_clone)
1799 );
1800 Arc::new(Mutex::new(
1801 ProxyDevice::new(
1802 fw_cfg,
1803 jail_clone,
1804 Vec::new(),
1805 #[cfg(feature = "swap")]
1806 swap_controller,
1807 )
1808 .map_err(Error::CreateProxyDevice)?,
1809 ))
1810 }
1811 #[cfg(windows)]
1812 Some(_) => unreachable!(),
1813 None => Arc::new(Mutex::new(fw_cfg)),
1814 };
1815
1816 io_bus
1817 .insert(fw_cfg, FW_CFG_BASE_PORT, FW_CFG_WIDTH)
1818 .map_err(Error::InsertBus)?;
1819
1820 Ok(())
1821 }
1822
1823 /// Sets up the legacy x86 i8042/KBD platform device
1824 ///
1825 /// # Arguments
1826 ///
1827 /// * - `io_bus` - the IO bus object
1828 /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
1829 /// * - `vm_evt_wrtube` - the event object which should receive exit events
setup_legacy_i8042_device( io_bus: &Bus, pit_uses_speaker_port: bool, vm_evt_wrtube: SendTube, ) -> Result<()>1830 pub fn setup_legacy_i8042_device(
1831 io_bus: &Bus,
1832 pit_uses_speaker_port: bool,
1833 vm_evt_wrtube: SendTube,
1834 ) -> Result<()> {
1835 let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
1836 vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1837 )));
1838
1839 if pit_uses_speaker_port {
1840 io_bus.insert(i8042, 0x062, 0x3).unwrap();
1841 } else {
1842 io_bus.insert(i8042, 0x061, 0x4).unwrap();
1843 }
1844
1845 Ok(())
1846 }
1847
1848 /// Sets up the legacy x86 CMOS/RTC platform device
1849 /// # Arguments
1850 ///
1851 /// * - `io_bus` - the IO bus object
1852 /// * - `mem_size` - the size in bytes of physical ram for the guest
setup_legacy_cmos_device( io_bus: &Bus, irq_chip: &mut dyn IrqChipX86_64, vm_control: Tube, mem_size: u64, ) -> anyhow::Result<()>1853 pub fn setup_legacy_cmos_device(
1854 io_bus: &Bus,
1855 irq_chip: &mut dyn IrqChipX86_64,
1856 vm_control: Tube,
1857 mem_size: u64,
1858 ) -> anyhow::Result<()> {
1859 let mem_regions = arch_memory_regions(mem_size, None);
1860
1861 let mem_below_4g = mem_regions
1862 .iter()
1863 .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
1864 .map(|r| r.1)
1865 .sum();
1866
1867 let mem_above_4g = mem_regions
1868 .iter()
1869 .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
1870 .map(|r| r.1)
1871 .sum();
1872
1873 let irq_evt = devices::IrqEdgeEvent::new().context("cmos irq")?;
1874 let cmos = devices::cmos::Cmos::new(
1875 mem_below_4g,
1876 mem_above_4g,
1877 Utc::now,
1878 vm_control,
1879 irq_evt.try_clone().context("cmos irq clone")?,
1880 )
1881 .context("create cmos")?;
1882
1883 irq_chip
1884 .register_edge_irq_event(
1885 devices::cmos::RTC_IRQ as u32,
1886 &irq_evt,
1887 IrqEventSource::from_device(&cmos),
1888 )
1889 .context("cmos register irq")?;
1890 io_bus
1891 .insert(Arc::new(Mutex::new(cmos)), 0x70, 0x2)
1892 .context("cmos insert irq")?;
1893
1894 Ok(())
1895 }
1896
    /// Sets up the acpi devices for this platform and
    /// return the resources which is used to set the ACPI tables.
    ///
    /// # Arguments
    ///
    /// * - `pci_root` the PCI root bridge, used to enumerate devices for AML generation
    /// * - `mem` the guest memory, used to locate the PCIe VCFG MMIO range
    /// * - `io_bus` the I/O bus to add the devices to
    /// * - `resources` the SystemAllocator to allocate IO and MMIO for acpi devices.
    /// * - `suspend_evt` the event object which used to suspend the vm
    /// * - `vm_evt_wrtube` the tube used to send VM events
    /// * - `sdts` ACPI system description tables
    /// * - `irq_chip` the IrqChip object for registering irq events
    /// * - `sci_irq` the GSI to register for the ACPI SCI level-triggered interrupt
    /// * - `battery` indicate whether to create the battery
    /// * - `mmio_bus` the MMIO bus to add the devices to
    /// * - `max_bus` the highest PCI bus number, reported in the root bridge _CRS
    /// * - `resume_notify_devices` devices that should be notified when the VM resumes
    /// * - `pci_irqs` IRQ assignment of PCI devices. Tuples of (PCI address, gsi, PCI interrupt
    ///   pin). Note that this matches one of the return values of generate_pci_root.
    pub fn setup_acpi_devices(
        pci_root: Arc<Mutex<PciRoot>>,
        mem: &GuestMemory,
        io_bus: &Bus,
        resources: &mut SystemAllocator,
        suspend_evt: Event,
        vm_evt_wrtube: SendTube,
        sdts: Vec<SDT>,
        irq_chip: &mut dyn IrqChip,
        sci_irq: u32,
        battery: (Option<BatteryType>, Option<Minijail>),
        #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus,
        max_bus: u8,
        resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
        #[cfg(any(target_os = "android", target_os = "linux"))] ac_adapter: bool,
        #[cfg(any(target_os = "android", target_os = "linux"))] guest_suspended_cvar: Option<
            Arc<(Mutex<bool>, Condvar)>,
        >,
        pci_irqs: &[(PciAddress, u32, PciInterruptPin)],
    ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
        // The AML data for the acpi devices. NOTE: the order in which devices
        // append their bytes here determines the layout of the generated SSDT.
        let mut amls = Vec::new();

        // Optional battery device; currently only the goldfish battery is
        // supported, and only on Android/Linux hosts.
        let bat_control = if let Some(battery_type) = battery.0 {
            match battery_type {
                #[cfg(any(target_os = "android", target_os = "linux"))]
                BatteryType::Goldfish => {
                    let irq_num = resources.allocate_irq().ok_or(Error::CreateBatDevices(
                        arch::DeviceRegistrationError::AllocateIrq,
                    ))?;
                    let (control_tube, _mmio_base) = arch::sys::linux::add_goldfish_battery(
                        &mut amls,
                        battery.1,
                        mmio_bus,
                        irq_chip,
                        irq_num,
                        resources,
                        #[cfg(feature = "swap")]
                        swap_controller,
                    )
                    .map_err(Error::CreateBatDevices)?;
                    Some(BatControl {
                        type_: BatteryType::Goldfish,
                        control_tube,
                    })
                }
                #[cfg(windows)]
                _ => None,
            }
        } else {
            None
        };

        // Allocate the I/O port range for the ACPI PM (power management) block,
        // falling back to the conventional 0x600 base when there is no I/O
        // allocator.
        let pm_alloc = resources.get_anon_alloc();
        let pm_iobase = match resources.io_allocator() {
            Some(io) => io
                .allocate_with_align(
                    devices::acpi::ACPIPM_RESOURCE_LEN as u64,
                    pm_alloc,
                    "ACPIPM".to_string(),
                    4, // must be 32-bit aligned
                )
                .map_err(Error::AllocateIOResouce)?,
            None => 0x600,
        };

        // Publish the PCIe VCFG MMIO base address to the guest via an AML name.
        let pcie_vcfg = aml::Name::new(
            "VCFG".into(),
            &Self::get_pcie_vcfg_mmio_range(mem, &read_pcie_cfg_mmio()).start,
        );
        pcie_vcfg.to_aml_bytes(&mut amls);

        let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;

        // Optional virtual AC adapter device (Android/Linux only).
        #[cfg(any(target_os = "android", target_os = "linux"))]
        let acdc = if ac_adapter {
            // Allocate GPE for AC adapter notfication
            let gpe = resources.allocate_gpe().ok_or(Error::AllocateGpe)?;

            let alloc = resources.get_anon_alloc();
            let mmio_base = resources
                .allocate_mmio(
                    devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
                    alloc,
                    "AcAdapter".to_string(),
                    resources::AllocOptions::new().align(devices::ac_adapter::ACDC_VIRT_MMIO_SIZE),
                )
                .unwrap();
            let ac_adapter_dev = devices::ac_adapter::AcAdapter::new(mmio_base, gpe);
            let ac_dev = Arc::new(Mutex::new(ac_adapter_dev));
            mmio_bus
                .insert(
                    ac_dev.clone(),
                    mmio_base,
                    devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
                )
                .unwrap();

            ac_dev.lock().to_aml_bytes(&mut amls);
            Some(ac_dev)
        } else {
            None
        };
        #[cfg(windows)]
        let acdc = None;

        // Virtual PMC: lets the guest signal suspend state back to the host
        // via the shared condvar (Android/Linux only).
        #[cfg(any(target_os = "android", target_os = "linux"))]
        if let Some(guest_suspended_cvar) = guest_suspended_cvar {
            let alloc = resources.get_anon_alloc();
            let mmio_base = resources
                .allocate_mmio(
                    devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
                    alloc,
                    "VirtualPmc".to_string(),
                    resources::AllocOptions::new().align(devices::pmc_virt::VPMC_VIRT_MMIO_SIZE),
                )
                .unwrap();

            let pmc_virtio_mmio =
                Arc::new(Mutex::new(VirtualPmc::new(mmio_base, guest_suspended_cvar)));
            mmio_bus
                .insert(
                    pmc_virtio_mmio.clone(),
                    mmio_base,
                    devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
                )
                .unwrap();
            pmc_virtio_mmio.lock().to_aml_bytes(&mut amls);
        }

        // The PM resource device handles suspend/resume and SCI delivery.
        let mut pmresource = devices::ACPIPMResource::new(
            pm_sci_evt.try_clone().map_err(Error::CloneEvent)?,
            suspend_evt,
            vm_evt_wrtube,
            acdc,
        );
        pmresource.to_aml_bytes(&mut amls);
        irq_chip
            .register_level_irq_event(
                sci_irq,
                &pm_sci_evt,
                IrqEventSource::from_device(&pmresource),
            )
            .map_err(Error::RegisterIrqfd)?;
        pmresource.start();

        // Build the root bridge _CRS: bus number range, config ports, and one
        // entry per MMIO pool (32-bit form when the range fits in u32).
        let mut crs_entries: Vec<Box<dyn Aml>> = vec![
            Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
            Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
        ];
        for r in resources.mmio_pools() {
            let entry: Box<dyn Aml> = match (u32::try_from(r.start), u32::try_from(r.end)) {
                (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
                    aml::AddressSpaceCachable::NotCacheable,
                    true,
                    start,
                    end,
                )),
                _ => Box::new(aml::AddressSpace::new_memory(
                    aml::AddressSpaceCachable::NotCacheable,
                    true,
                    r.start,
                    r.end,
                )),
            };
            crs_entries.push(entry);
        }

        // _PRT (PCI routing table) entries mapping each device/pin to its GSI.
        let prt_entries: Vec<aml::Package> = pci_irqs
            .iter()
            .map(|(pci_address, gsi, pci_intr_pin)| {
                aml::Package::new(vec![
                    &pci_address.acpi_adr(),
                    &pci_intr_pin.to_mask(),
                    &aml::ZERO,
                    gsi,
                ])
            })
            .collect();

        // Emit the PCI root bridge device (PNP0A08/PNP0A03) with its _CRS and _PRT.
        aml::Device::new(
            "_SB_.PC00".into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08")),
                &aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03")),
                &aml::Name::new("_ADR".into(), &aml::ZERO),
                &aml::Name::new("_SEG".into(), &aml::ZERO),
                &aml::Name::new("_UID".into(), &aml::ZERO),
                &aml::Name::new("SUPP".into(), &aml::ZERO),
                &aml::Name::new(
                    "_CRS".into(),
                    &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
                ),
                &PciRootOSC {},
                &aml::Name::new(
                    "_PRT".into(),
                    &aml::Package::new(prt_entries.iter().map(|p| p as &dyn Aml).collect()),
                ),
            ],
        )
        .to_aml_bytes(&mut amls);

        // Reserve the PCIe MMCFG region via a motherboard resource device so
        // the guest OS doesn't allocate it for something else. Only possible
        // when the region fits in a 32-bit Memory32Fixed descriptor.
        if let (Some(start), Some(len)) = (
            u32::try_from(read_pcie_cfg_mmio().start).ok(),
            read_pcie_cfg_mmio()
                .len()
                .and_then(|l| u32::try_from(l).ok()),
        ) {
            aml::Device::new(
                "_SB_.MB00".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C02")),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::Memory32Fixed::new(
                            true, start, len,
                        )]),
                    ),
                ],
            )
            .to_aml_bytes(&mut amls);
        } else {
            warn!("Failed to create ACPI MMCFG region reservation");
        }

        // Emit a _PRW (wake capability) entry for every PCI device directly
        // under the root bus that has an ACPI path.
        let root_bus = pci_root.lock().get_root_bus();
        let addresses = root_bus.lock().get_downstream_devices();
        for address in addresses {
            if let Some(acpi_path) = pci_root.lock().acpi_path(&address) {
                const DEEPEST_SLEEP_STATE: u32 = 3;
                aml::Device::new(
                    (*acpi_path).into(),
                    vec![
                        &aml::Name::new("_ADR".into(), &address.acpi_adr()),
                        &aml::Name::new(
                            "_PRW".into(),
                            &aml::Package::new(vec![&PM_WAKEUP_GPIO, &DEEPEST_SLEEP_STATE]),
                        ),
                    ],
                )
                .to_aml_bytes(&mut amls);
            }
        }

        // Place the PM block on the I/O bus and register it for resume
        // notifications.
        let pm = Arc::new(Mutex::new(pmresource));
        io_bus
            .insert(
                pm.clone(),
                pm_iobase,
                devices::acpi::ACPIPM_RESOURCE_LEN as u64,
            )
            .unwrap();
        resume_notify_devices.push(pm.clone());

        Ok((
            acpi::AcpiDevResource {
                amls,
                pm_iobase,
                pm,
                sdts,
            },
            bat_control,
        ))
    }
2177
2178 /// Sets up the serial devices for this platform. Returns a list of configured serial devices.
2179 ///
2180 /// # Arguments
2181 ///
2182 /// * - `irq_chip` the IrqChip object for registering irq events
2183 /// * - `io_bus` the I/O bus to add the devices to
2184 /// * - `serial_parameters` - definitions for how the serial devices should be configured
setup_serial_devices( protection_type: ProtectionType, irq_chip: &mut dyn IrqChip, io_bus: &Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<Vec<SerialDeviceInfo>>2185 pub fn setup_serial_devices(
2186 protection_type: ProtectionType,
2187 irq_chip: &mut dyn IrqChip,
2188 io_bus: &Bus,
2189 serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2190 serial_jail: Option<Minijail>,
2191 #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2192 ) -> Result<Vec<SerialDeviceInfo>> {
2193 let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2194 let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2195
2196 let serial_devices = arch::add_serial_devices(
2197 protection_type,
2198 io_bus,
2199 (X86_64_SERIAL_1_3_IRQ, com_evt_1_3.get_trigger()),
2200 (X86_64_SERIAL_2_4_IRQ, com_evt_2_4.get_trigger()),
2201 serial_parameters,
2202 serial_jail,
2203 #[cfg(feature = "swap")]
2204 swap_controller,
2205 )
2206 .map_err(Error::CreateSerialDevices)?;
2207
2208 let source = IrqEventSource {
2209 device_id: Serial::device_id(),
2210 queue_id: 0,
2211 device_name: Serial::debug_label(),
2212 };
2213 irq_chip
2214 .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3, source.clone())
2215 .map_err(Error::RegisterIrqfd)?;
2216 irq_chip
2217 .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4, source)
2218 .map_err(Error::RegisterIrqfd)?;
2219
2220 Ok(serial_devices)
2221 }
2222
setup_debugcon_devices( protection_type: ProtectionType, io_bus: &Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, debugcon_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>2223 fn setup_debugcon_devices(
2224 protection_type: ProtectionType,
2225 io_bus: &Bus,
2226 serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2227 debugcon_jail: Option<Minijail>,
2228 #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2229 ) -> Result<()> {
2230 for param in serial_parameters.values() {
2231 if param.hardware != SerialHardware::Debugcon {
2232 continue;
2233 }
2234
2235 let mut preserved_fds = Vec::new();
2236 let con = param
2237 .create_serial_device::<Debugcon>(
2238 protection_type,
2239 // Debugcon doesn't use the interrupt event
2240 &Event::new().map_err(Error::CreateEvent)?,
2241 &mut preserved_fds,
2242 )
2243 .map_err(Error::CreateDebugconDevice)?;
2244
2245 let con: Arc<Mutex<dyn BusDevice>> = match debugcon_jail.as_ref() {
2246 #[cfg(any(target_os = "android", target_os = "linux"))]
2247 Some(jail) => {
2248 let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
2249 #[cfg(feature = "seccomp_trace")]
2250 debug!(
2251 "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
2252 read_jail_addr(jail),
2253 read_jail_addr(&jail_clone)
2254 );
2255 Arc::new(Mutex::new(
2256 ProxyDevice::new(
2257 con,
2258 jail_clone,
2259 preserved_fds,
2260 #[cfg(feature = "swap")]
2261 swap_controller,
2262 )
2263 .map_err(Error::CreateProxyDevice)?,
2264 ))
2265 }
2266 #[cfg(windows)]
2267 Some(_) => unreachable!(),
2268 None => Arc::new(Mutex::new(con)),
2269 };
2270 io_bus
2271 .insert(con.clone(), param.debugcon_port.into(), 1)
2272 .map_err(Error::InsertBus)?;
2273 }
2274
2275 Ok(())
2276 }
2277 }
2278
2279 #[sorted]
2280 #[derive(Error, Debug)]
2281 pub enum MsrError {
2282 #[error("CPU not support. Only intel CPUs support ITMT.")]
2283 CpuUnSupport,
2284 #[error("msr must be unique: {0}")]
2285 MsrDuplicate(u32),
2286 }
2287
/// Errors returned by [`check_host_hybrid_support`].
#[derive(Error, Debug)]
pub enum HybridSupportError {
    #[error("Host CPU doesn't support hybrid architecture.")]
    UnsupportedHostCpu,
}
2293
/// The wrapper for CPUID call functions.
///
/// Holding the functions as fields lets tests substitute fakes for the real
/// `__cpuid`/`__cpuid_count` intrinsics.
pub struct CpuIdCall {
    /// __cpuid_count or a fake function for test.
    cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
    /// __cpuid or a fake function for test.
    cpuid: unsafe fn(u32) -> CpuidResult,
}
2301
2302 impl CpuIdCall {
new( cpuid_count: unsafe fn(u32, u32) -> CpuidResult, cpuid: unsafe fn(u32) -> CpuidResult, ) -> CpuIdCall2303 pub fn new(
2304 cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
2305 cpuid: unsafe fn(u32) -> CpuidResult,
2306 ) -> CpuIdCall {
2307 CpuIdCall { cpuid_count, cpuid }
2308 }
2309 }
2310
2311 /// Check if host supports hybrid CPU feature. The check include:
2312 /// 1. Check if CPUID.1AH exists. CPUID.1AH is hybrid information enumeration leaf.
2313 /// 2. Check if CPUID.07H.00H:EDX[bit 15] sets. This bit means the processor is identified as a
2314 /// hybrid part.
2315 /// 3. Check if CPUID.1AH:EAX sets. The hybrid core type is set in EAX.
2316 ///
2317 /// # Arguments
2318 ///
2319 /// * - `cpuid` the wrapped cpuid functions used to get CPUID info.
check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError>2320 pub fn check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError> {
2321 // CPUID.0H.EAX returns maximum input value for basic CPUID information.
2322 //
2323 // SAFETY:
2324 // Safe because we pass 0 for this call and the host supports the
2325 // `cpuid` instruction.
2326 let mut cpuid_entry = unsafe { (cpuid.cpuid)(0x0) };
2327 if cpuid_entry.eax < 0x1A {
2328 return Err(HybridSupportError::UnsupportedHostCpu);
2329 }
2330 // SAFETY:
2331 // Safe because we pass 0x7 and 0 for this call and the host supports the
2332 // `cpuid` instruction.
2333 cpuid_entry = unsafe { (cpuid.cpuid_count)(0x7, 0) };
2334 if cpuid_entry.edx & 1 << EDX_HYBRID_CPU_SHIFT == 0 {
2335 return Err(HybridSupportError::UnsupportedHostCpu);
2336 }
2337 // From SDM, if a value entered for CPUID.EAX is less than or equal to the
2338 // maximum input value and the leaf is not supported on that processor then
2339 // 0 is returned in all the registers.
2340 // For the CPU with hybrid support, its CPUID.1AH.EAX shouldn't be zero.
2341 //
2342 // SAFETY:
2343 // Safe because we pass 0 for this call and the host supports the
2344 // `cpuid` instruction.
2345 cpuid_entry = unsafe { (cpuid.cpuid)(0x1A) };
2346 if cpuid_entry.eax == 0 {
2347 return Err(HybridSupportError::UnsupportedHostCpu);
2348 }
2349 Ok(())
2350 }
2351
#[cfg(test)]
mod tests {
    use std::mem::size_of;

    use super::*;

    const TEST_MEMORY_SIZE: u64 = 2 * GB;

    /// Initializes the low memory layout constants used by the region tests.
    fn setup() {
        let pcie_ecam = Some(AddressRange::from_start_and_size(3 * GB, 256 * MB).unwrap());
        let pci_start = Some(2 * GB);
        init_low_memory_layout(pcie_ecam, pci_start);
    }

    #[test]
    fn regions_lt_4gb_nobios() {
        setup();
        let regions = arch_memory_regions(512 * MB, /* bios_size */ None);
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(1u64 << 29, regions[0].1);
    }

    #[test]
    fn regions_gt_4gb_nobios() {
        setup();
        let size = 4 * GB + 0x8000;
        let regions = arch_memory_regions(size, /* bios_size */ None);
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(GuestAddress(4 * GB), regions[1].0);
        assert_eq!(4 * GB + 0x8000, regions[0].1 + regions[1].1);
    }

    #[test]
    fn regions_lt_4gb_bios() {
        setup();
        let bios_len = MB;
        let regions = arch_memory_regions(512 * MB, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(512 * MB, regions[0].1);
        // The BIOS region is mapped so that it ends exactly at the 4 GiB boundary.
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    #[test]
    fn regions_gt_4gb_bios() {
        setup();
        let bios_len = MB;
        let regions = arch_memory_regions(4 * GB + 0x8000, Some(bios_len));
        assert_eq!(3, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
        assert_eq!(GuestAddress(4 * GB), regions[2].0);
    }

    #[test]
    fn regions_eq_4gb_nobios() {
        setup();
        // Test with memory sized exactly TEST_MEMORY_SIZE minus the low-RAM offset.
        let regions = arch_memory_regions(
            TEST_MEMORY_SIZE - START_OF_RAM_32BITS,
            /* bios_size */ None,
        );
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, regions[0].1);
    }

    #[test]
    fn regions_eq_4gb_bios() {
        setup();
        // Test with memory sized exactly TEST_MEMORY_SIZE minus the low-RAM offset.
        let bios_len = MB;
        let regions = arch_memory_regions(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, Some(bios_len));
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    #[test]
    fn check_pci_mmio_layout() {
        setup();

        assert_eq!(read_pci_mmio_before_32bit().start, 2 * GB);
        assert_eq!(read_pcie_cfg_mmio().start, 3 * GB);
        assert_eq!(read_pcie_cfg_mmio().len().unwrap(), 256 * MB);
    }

    #[test]
    fn check_32bit_gap_size_alignment() {
        setup();
        // pci_low_start is 256 MB aligned to be friendly for MTRR mappings.
        assert_eq!(read_pci_mmio_before_32bit().start % (256 * MB), 0);
    }

    #[test]
    fn write_setup_data_empty() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();
        let setup_data = [];
        let setup_data_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        // No entries means no setup_data list head.
        assert_eq!(setup_data_addr, None);
    }

    #[test]
    fn write_setup_data_two_of_them() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();

        // Layout of a setup_data entry: next pointer (u64) at +0, type (u32) at
        // +8, len (u32) at +12, payload at +16.
        let entry1_addr = GuestAddress(0x1000);
        let entry1_next_addr = entry1_addr;
        let entry1_len_addr = entry1_addr.checked_add(12).unwrap();
        let entry1_data_addr = entry1_addr.checked_add(16).unwrap();
        let entry1_data = [0x55u8; 13];
        let entry1_size = (size_of::<setup_data_hdr>() + entry1_data.len()) as u64;
        let entry1_align = 3;

        let entry2_addr = GuestAddress(entry1_addr.offset() + entry1_size + entry1_align);
        let entry2_next_addr = entry2_addr;
        let entry2_len_addr = entry2_addr.checked_add(12).unwrap();
        let entry2_data_addr = entry2_addr.checked_add(16).unwrap();
        let entry2_data = [0xAAu8; 9];

        let setup_data = [
            SetupData {
                data: entry1_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
            SetupData {
                data: entry2_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
        ];

        let setup_data_head_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        assert_eq!(setup_data_head_addr, Some(entry1_addr));

        // First entry links to the second and carries its payload verbatim.
        assert_eq!(
            mem.read_obj_from_addr::<u64>(entry1_next_addr).unwrap(),
            entry2_addr.offset()
        );
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry1_len_addr).unwrap(),
            entry1_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 13]>(entry1_data_addr)
                .unwrap(),
            entry1_data
        );

        // Second entry terminates the list (next == 0).
        assert_eq!(mem.read_obj_from_addr::<u64>(entry2_next_addr).unwrap(), 0);
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry2_len_addr).unwrap(),
            entry2_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 9]>(entry2_data_addr).unwrap(),
            entry2_data
        );
    }
}
2539