1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
// Device tree blob creation; `fdt::create_fdt` is called from `setup_system_memory`.
mod fdt;

/// e820 memory type code passed to `add_e820_entry` for every RAM range registered in this file.
const E820_RAM: u32 = 1;
/// Not referenced elsewhere in this file; presumably the setup_data type used by the `fdt`
/// submodule for a device tree blob -- TODO confirm.
const SETUP_DTB: u32 = 2;
/// Maximum size handed to `fdt::create_fdt` for the generated device tree blob.
const X86_64_FDT_MAX_SIZE: u64 = 0x200000;

// Provides `boot_params`, the structure written to the zero page by `configure_system`.
// NOTE(review): the lint allowances suggest generated bindings -- verify against the module.
#[allow(dead_code)]
#[allow(non_upper_case_globals)]
#[allow(non_camel_case_types)]
#[allow(non_snake_case)]
mod bootparam;

// boot_params is just a series of ints, it is safe to initialize it.
unsafe impl data_model::DataInit for bootparam::boot_params {}

// Not referenced directly in this file; presumably MSR constants used by `regs` -- TODO confirm.
#[allow(dead_code)]
#[allow(non_upper_case_globals)]
mod msr_index;

// Provides the `mpc_*` / `mpf_intel` table structures that get `DataInit` impls below.
#[allow(dead_code)]
#[allow(non_upper_case_globals)]
#[allow(non_camel_case_types)]
#[allow(clippy::all)]
mod mpspec;
// These mpspec types are only data, reading them from data is a safe initialization.
unsafe impl data_model::DataInit for mpspec::mpc_bus {}
unsafe impl data_model::DataInit for mpspec::mpc_cpu {}
unsafe impl data_model::DataInit for mpspec::mpc_intsrc {}
unsafe impl data_model::DataInit for mpspec::mpc_ioapic {}
unsafe impl data_model::DataInit for mpspec::mpc_table {}
unsafe impl data_model::DataInit for mpspec::mpc_lintsrc {}
unsafe impl data_model::DataInit for mpspec::mpf_intel {}

// Setup helpers; see their call sites below (e.g. `bzimage::load_bzimage`,
// `cpuid::setup_cpuid`, `interrupts::set_lint`, `mptable::setup_mptable`,
// `regs::setup_regs`, `smbios::setup_smbios`).
mod bzimage;
mod cpuid;
mod gdt;
mod interrupts;
mod mptable;
mod regs;
mod smbios;
45
46 use std::collections::BTreeMap;
47 use std::error::Error as StdError;
48 use std::ffi::{CStr, CString};
49 use std::fmt::{self, Display};
50 use std::fs::File;
51 use std::io::{self, Seek};
52 use std::mem;
53 use std::sync::Arc;
54
55 use crate::bootparam::boot_params;
56 use arch::{RunnableLinuxVm, VmComponents, VmImage};
57 use devices::{get_serial_tty_string, PciConfigIo, PciDevice, PciInterruptPin, SerialParameters};
58 use io_jail::Minijail;
59 use kvm::*;
60 use remain::sorted;
61 use resources::SystemAllocator;
62 use sync::Mutex;
63 use sys_util::{Clock, EventFd, GuestAddress, GuestMemory, GuestMemoryError};
64
#[sorted]
#[derive(Debug)]
/// Errors returned by the x86_64 VM setup code in this file.
///
/// Variants must stay alphabetically ordered because of `#[sorted]`.
pub enum Error {
    /// Cloning an `EventFd` failed.
    CloneEventFd(sys_util::Error),
    /// The given kernel command line was invalid.
    Cmdline(kernel_cmdline::Error),
    /// Configuring the system failed.
    ConfigureSystem,
    /// The caller-supplied device creation callback failed.
    CreateDevices(Box<dyn StdError>),
    /// Creating an `EventFd` failed.
    CreateEventFd(sys_util::Error),
    /// Creating the flattened device tree failed.
    CreateFdt(arch::fdt::Error),
    /// Creating the irq chip failed.
    CreateIrqChip(sys_util::Error),
    /// Opening /dev/kvm failed.
    CreateKvm(sys_util::Error),
    /// Creating the PCI root hub failed.
    CreatePciRoot(arch::DeviceRegistrationError),
    /// Creating the in-kernel PIT failed.
    CreatePit(sys_util::Error),
    /// Creating the userspace PIT device failed.
    CreatePitDevice(devices::PitError),
    /// Creating the serial devices failed.
    CreateSerialDevices(arch::DeviceRegistrationError),
    /// Creating a socket failed.
    CreateSocket(io::Error),
    /// Creating a VCPU failed.
    CreateVcpu(sys_util::Error),
    /// Creating the VM failed.
    CreateVm(sys_util::Error),
    /// The e820 map has no room for another entry.
    E820Configuration,
    /// The kernel extends past the end of RAM.
    KernelOffsetPastEnd,
    /// Loading the BIOS image failed.
    LoadBios(io::Error),
    /// Loading the kernel as a bzImage failed.
    LoadBzImage(bzimage::Error),
    /// Loading the kernel command line into guest memory failed.
    LoadCmdline(kernel_loader::Error),
    /// Loading the initrd failed.
    LoadInitrd(arch::LoadImageError),
    /// Loading the kernel failed.
    LoadKernel(kernel_loader::Error),
    /// Registering an irqfd with the VM failed.
    RegisterIrqfd(sys_util::Error),
    /// Registering the virtual socket device failed.
    RegisterVsock(arch::DeviceRegistrationError),
    /// Setting up the vcpu's interrupts failed.
    SetLint(interrupts::Error),
    /// Setting the TSS address failed.
    SetTssAddr(sys_util::Error),
    /// Setting up cpuid failed.
    SetupCpuid(cpuid::Error),
    /// Setting up the FPU failed.
    SetupFpu(regs::Error),
    /// Setting up guest memory failed.
    SetupGuestMemory(GuestMemoryError),
    /// Setting up the mptable failed.
    SetupMptable(mptable::Error),
    /// Setting up the MSRs failed.
    SetupMsrs(regs::Error),
    /// Setting up the general registers failed.
    SetupRegs(regs::Error),
    /// Setting up SMBIOS failed.
    SetupSmbios(smbios::Error),
    /// Setting up the special registers failed.
    SetupSregs(regs::Error),
    /// The zero page extends past the end of guest memory.
    ZeroPagePastRamEnd,
    /// Writing the zero page to guest memory failed.
    ZeroPageSetup,
}
105
106 impl Display for Error {
107 #[remain::check]
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result108 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
109 use self::Error::*;
110
111 #[sorted]
112 match self {
113 CloneEventFd(e) => write!(f, "unable to clone an EventFd: {}", e),
114 Cmdline(e) => write!(f, "the given kernel command line was invalid: {}", e),
115 ConfigureSystem => write!(f, "error configuring the system"),
116 CreateDevices(e) => write!(f, "error creating devices: {}", e),
117 CreateEventFd(e) => write!(f, "unable to make an EventFd: {}", e),
118 CreateFdt(e) => write!(f, "failed to create fdt: {}", e),
119 CreateIrqChip(e) => write!(f, "failed to create irq chip: {}", e),
120 CreateKvm(e) => write!(f, "failed to open /dev/kvm: {}", e),
121 CreatePciRoot(e) => write!(f, "failed to create a PCI root hub: {}", e),
122 CreatePit(e) => write!(f, "unable to create PIT: {}", e),
123 CreatePitDevice(e) => write!(f, "unable to make PIT device: {}", e),
124 CreateSerialDevices(e) => write!(f, "unable to create serial devices: {}", e),
125 CreateSocket(e) => write!(f, "failed to create socket: {}", e),
126 CreateVcpu(e) => write!(f, "failed to create VCPU: {}", e),
127 CreateVm(e) => write!(f, "failed to create VM: {}", e),
128 E820Configuration => write!(f, "invalid e820 setup params"),
129 KernelOffsetPastEnd => write!(f, "the kernel extends past the end of RAM"),
130 LoadBios(e) => write!(f, "error loading bios: {}", e),
131 LoadBzImage(e) => write!(f, "error loading kernel bzImage: {}", e),
132 LoadCmdline(e) => write!(f, "error loading command line: {}", e),
133 LoadInitrd(e) => write!(f, "error loading initrd: {}", e),
134 LoadKernel(e) => write!(f, "error loading Kernel: {}", e),
135 RegisterIrqfd(e) => write!(f, "error registering an IrqFd: {}", e),
136 RegisterVsock(e) => write!(f, "error registering virtual socket device: {}", e),
137 SetLint(e) => write!(f, "failed to set interrupts: {}", e),
138 SetTssAddr(e) => write!(f, "failed to set tss addr: {}", e),
139 SetupCpuid(e) => write!(f, "failed to set up cpuid: {}", e),
140 SetupFpu(e) => write!(f, "failed to set up FPU: {}", e),
141 SetupGuestMemory(e) => write!(f, "failed to set up guest memory: {}", e),
142 SetupMptable(e) => write!(f, "failed to set up mptable: {}", e),
143 SetupMsrs(e) => write!(f, "failed to set up MSRs: {}", e),
144 SetupRegs(e) => write!(f, "failed to set up registers: {}", e),
145 SetupSmbios(e) => write!(f, "failed to set up SMBIOS: {}", e),
146 SetupSregs(e) => write!(f, "failed to set up sregs: {}", e),
147 ZeroPagePastRamEnd => write!(f, "the zero page extends past the end of guest_mem"),
148 ZeroPageSetup => write!(f, "error writing the zero page of guest memory"),
149 }
150 }
151 }
152
/// Result type used throughout this file, with `Error` as the error type.
pub type Result<T> = std::result::Result<T, Error>;

// `Error` already implements `Debug` and `Display`; the trait's default methods are used.
impl std::error::Error for Error {}

/// Unit type carrying the x86_64 implementation of `arch::LinuxArch` and its helper functions.
pub struct X8664arch;
158
/// Stack pointer value passed to `regs::setup_regs` by `configure_vcpu`.
const BOOT_STACK_POINTER: u64 = 0x8000;
/// Size of the hole left without RAM directly below `FIRST_ADDR_PAST_32BITS` (768 MiB).
const MEM_32BIT_GAP_SIZE: u64 = (768 << 20);
/// First guest physical address above the 32-bit address space (4 GiB).
const FIRST_ADDR_PAST_32BITS: u64 = (1 << 32);
/// Offset added to the kernel load address to form the address `configure_vcpu` passes to
/// `regs::setup_regs` (presumably the 64-bit entry point -- confirm against `regs`).
const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
/// Guest physical address at which `configure_system` writes the `boot_params` zero page.
const ZERO_PAGE_OFFSET: u64 = 0x7000;
/// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
/// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
/// pointer at the effective physical address 0xFFFFFFF0.
const BIOS_LEN: usize = 1 << 20;
/// Guest physical address of the BIOS image: the last `BIOS_LEN` bytes below 4 GiB.
const BIOS_START: u64 = FIRST_ADDR_PAST_32BITS - (BIOS_LEN as u64);

/// Guest physical address at which the kernel image is loaded.
const KERNEL_START_OFFSET: u64 = 0x200000;
/// Guest physical address at which the kernel command line is loaded.
const CMDLINE_OFFSET: u64 = 0x20000;
/// Command line capacity: the space between `CMDLINE_OFFSET` and `KERNEL_START_OFFSET`.
const CMDLINE_MAX_SIZE: u64 = KERNEL_START_OFFSET - CMDLINE_OFFSET;
/// IRQ registered for the event shared by serial ports 1 and 3.
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
/// IRQ registered for the event shared by serial ports 2 and 4.
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
/// Passed to `create_allocator` in `get_resource_allocator`; presumably the first IRQ number
/// available for allocation -- confirm against `resources::SystemAllocator`.
const X86_64_IRQ_BASE: u32 = 5;
176
configure_system( guest_mem: &GuestMemory, _mem_size: u64, kernel_addr: GuestAddress, cmdline_addr: GuestAddress, cmdline_size: usize, num_cpus: u8, pci_irqs: Vec<(u32, PciInterruptPin)>, setup_data: Option<GuestAddress>, initrd: Option<(GuestAddress, usize)>, mut params: boot_params, ) -> Result<()>177 fn configure_system(
178 guest_mem: &GuestMemory,
179 _mem_size: u64,
180 kernel_addr: GuestAddress,
181 cmdline_addr: GuestAddress,
182 cmdline_size: usize,
183 num_cpus: u8,
184 pci_irqs: Vec<(u32, PciInterruptPin)>,
185 setup_data: Option<GuestAddress>,
186 initrd: Option<(GuestAddress, usize)>,
187 mut params: boot_params,
188 ) -> Result<()> {
189 const EBDA_START: u64 = 0x0009fc00;
190 const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
191 const KERNEL_HDR_MAGIC: u32 = 0x53726448;
192 const KERNEL_LOADER_OTHER: u8 = 0xff;
193 const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x1000000; // Must be non-zero.
194 let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
195 let end_32bit_gap_start = GuestAddress(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE);
196
197 // Note that this puts the mptable at 0x0 in guest physical memory.
198 mptable::setup_mptable(guest_mem, num_cpus, pci_irqs).map_err(Error::SetupMptable)?;
199
200 smbios::setup_smbios(guest_mem).map_err(Error::SetupSmbios)?;
201
202 params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
203 params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
204 params.hdr.header = KERNEL_HDR_MAGIC;
205 params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
206 params.hdr.cmdline_size = cmdline_size as u32;
207 params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
208 if let Some(setup_data) = setup_data {
209 params.hdr.setup_data = setup_data.offset();
210 }
211 if let Some((initrd_addr, initrd_size)) = initrd {
212 params.hdr.ramdisk_image = initrd_addr.offset() as u32;
213 params.hdr.ramdisk_size = initrd_size as u32;
214 }
215
216 add_e820_entry(&mut params, 0, EBDA_START, E820_RAM)?;
217
218 let mem_end = guest_mem.end_addr();
219 if mem_end < end_32bit_gap_start {
220 add_e820_entry(
221 &mut params,
222 kernel_addr.offset() as u64,
223 mem_end.offset_from(kernel_addr) as u64,
224 E820_RAM,
225 )?;
226 } else {
227 add_e820_entry(
228 &mut params,
229 kernel_addr.offset() as u64,
230 end_32bit_gap_start.offset_from(kernel_addr) as u64,
231 E820_RAM,
232 )?;
233 if mem_end > first_addr_past_32bits {
234 add_e820_entry(
235 &mut params,
236 first_addr_past_32bits.offset() as u64,
237 mem_end.offset_from(first_addr_past_32bits) as u64,
238 E820_RAM,
239 )?;
240 }
241 }
242
243 let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
244 guest_mem
245 .checked_offset(zero_page_addr, mem::size_of::<boot_params>() as u64)
246 .ok_or(Error::ZeroPagePastRamEnd)?;
247 guest_mem
248 .write_obj_at_addr(params, zero_page_addr)
249 .map_err(|_| Error::ZeroPageSetup)?;
250 Ok(())
251 }
252
253 /// Add an e820 region to the e820 map.
254 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
add_e820_entry(params: &mut boot_params, addr: u64, size: u64, mem_type: u32) -> Result<()>255 fn add_e820_entry(params: &mut boot_params, addr: u64, size: u64, mem_type: u32) -> Result<()> {
256 if params.e820_entries >= params.e820_table.len() as u8 {
257 return Err(Error::E820Configuration);
258 }
259
260 params.e820_table[params.e820_entries as usize].addr = addr;
261 params.e820_table[params.e820_entries as usize].size = size;
262 params.e820_table[params.e820_entries as usize].type_ = mem_type;
263 params.e820_entries += 1;
264
265 Ok(())
266 }
267
268 /// Returns a Vec of the valid memory addresses.
269 /// These should be used to configure the GuestMemory structure for the platform.
270 /// For x86_64 all addresses are valid from the start of the kernel except a
271 /// carve out at the end of 32bit address space.
arch_memory_regions(size: u64, has_bios: bool) -> Vec<(GuestAddress, u64)>272 fn arch_memory_regions(size: u64, has_bios: bool) -> Vec<(GuestAddress, u64)> {
273 let mem_end = GuestAddress(size);
274 let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
275 let end_32bit_gap_start = GuestAddress(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE);
276
277 let mut regions = Vec::new();
278 if mem_end < end_32bit_gap_start {
279 regions.push((GuestAddress(0), size));
280 if has_bios {
281 regions.push((GuestAddress(BIOS_START), BIOS_LEN as u64));
282 }
283 } else {
284 regions.push((GuestAddress(0), end_32bit_gap_start.offset()));
285 if mem_end > first_addr_past_32bits {
286 let region_start = if has_bios {
287 GuestAddress(BIOS_START)
288 } else {
289 first_addr_past_32bits
290 };
291 regions.push((region_start, mem_end.offset_from(first_addr_past_32bits)));
292 } else if has_bios {
293 regions.push((GuestAddress(BIOS_START), BIOS_LEN as u64));
294 }
295 }
296
297 regions
298 }
299
300 impl arch::LinuxArch for X8664arch {
301 type Error = Error;
302
build_vm<F, E>( mut components: VmComponents, split_irqchip: bool, serial_parameters: &BTreeMap<u8, SerialParameters>, create_devices: F, ) -> Result<RunnableLinuxVm> where F: FnOnce( &GuestMemory, &mut Vm, &mut SystemAllocator, &EventFd, ) -> std::result::Result<Vec<(Box<dyn PciDevice>, Option<Minijail>)>, E>, E: StdError + 'static,303 fn build_vm<F, E>(
304 mut components: VmComponents,
305 split_irqchip: bool,
306 serial_parameters: &BTreeMap<u8, SerialParameters>,
307 create_devices: F,
308 ) -> Result<RunnableLinuxVm>
309 where
310 F: FnOnce(
311 &GuestMemory,
312 &mut Vm,
313 &mut SystemAllocator,
314 &EventFd,
315 ) -> std::result::Result<Vec<(Box<dyn PciDevice>, Option<Minijail>)>, E>,
316 E: StdError + 'static,
317 {
318 let mut resources =
319 Self::get_resource_allocator(components.memory_size, components.wayland_dmabuf);
320 let has_bios = match components.vm_image {
321 VmImage::Bios(_) => true,
322 _ => false,
323 };
324 let mem = Self::setup_memory(components.memory_size, has_bios)?;
325 let kvm = Kvm::new().map_err(Error::CreateKvm)?;
326 let mut vm = Self::create_vm(&kvm, split_irqchip, mem.clone())?;
327
328 let vcpu_count = components.vcpu_count;
329 let mut vcpus = Vec::with_capacity(vcpu_count as usize);
330 for cpu_id in 0..vcpu_count {
331 let vcpu = Vcpu::new(cpu_id as libc::c_ulong, &kvm, &vm).map_err(Error::CreateVcpu)?;
332 if let VmImage::Kernel(_) = components.vm_image {
333 Self::configure_vcpu(
334 vm.get_memory(),
335 &kvm,
336 &vm,
337 &vcpu,
338 cpu_id as u64,
339 vcpu_count as u64,
340 )?;
341 }
342 vcpus.push(vcpu);
343 }
344
345 let vcpu_affinity = components.vcpu_affinity;
346
347 let irq_chip = Self::create_irq_chip(&vm)?;
348
349 let mut mmio_bus = devices::Bus::new();
350
351 let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
352
353 let pci_devices = create_devices(&mem, &mut vm, &mut resources, &exit_evt)
354 .map_err(|e| Error::CreateDevices(Box::new(e)))?;
355 let (pci, pci_irqs, pid_debug_label_map) =
356 arch::generate_pci_root(pci_devices, &mut mmio_bus, &mut resources, &mut vm)
357 .map_err(Error::CreatePciRoot)?;
358 let pci_bus = Arc::new(Mutex::new(PciConfigIo::new(pci)));
359
360 let mut io_bus = Self::setup_io_bus(
361 &mut vm,
362 split_irqchip,
363 exit_evt.try_clone().map_err(Error::CloneEventFd)?,
364 Some(pci_bus.clone()),
365 components.memory_size,
366 )?;
367
368 let (stdio_serial_num, stdio_serial) =
369 Self::setup_serial_devices(&mut vm, &mut io_bus, &serial_parameters)?;
370
371 match components.vm_image {
372 VmImage::Bios(ref mut bios) => Self::load_bios(&mem, bios)?,
373 VmImage::Kernel(ref mut kernel_image) => {
374 let mut cmdline = Self::get_base_linux_cmdline(stdio_serial_num);
375 for param in components.extra_kernel_params {
376 cmdline.insert_str(¶m).map_err(Error::Cmdline)?;
377 }
378
379 // separate out load_kernel from other setup to get a specific error for
380 // kernel loading
381 let (params, kernel_end) = Self::load_kernel(&mem, kernel_image)?;
382
383 Self::setup_system_memory(
384 &mem,
385 components.memory_size,
386 vcpu_count,
387 &CString::new(cmdline).unwrap(),
388 components.initrd_image,
389 pci_irqs,
390 components.android_fstab,
391 kernel_end,
392 params,
393 )?;
394 }
395 }
396 Ok(RunnableLinuxVm {
397 vm,
398 kvm,
399 resources,
400 stdio_serial,
401 exit_evt,
402 vcpus,
403 vcpu_affinity,
404 irq_chip,
405 io_bus,
406 mmio_bus,
407 pid_debug_label_map,
408 })
409 }
410 }
411
412 impl X8664arch {
413 /// Loads the bios from an open file.
414 ///
415 /// # Arguments
416 ///
417 /// * `mem` - The memory to be used by the guest.
418 /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>419 fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
420 let bios_image_length = bios_image
421 .seek(io::SeekFrom::End(0))
422 .map_err(Error::LoadBios)?;
423 if bios_image_length != BIOS_LEN as u64 {
424 return Err(Error::LoadBios(io::Error::new(
425 io::ErrorKind::InvalidData,
426 format!(
427 "bios was {} bytes, expected {}",
428 bios_image_length, BIOS_LEN
429 ),
430 )));
431 }
432 bios_image
433 .seek(io::SeekFrom::Start(0))
434 .map_err(Error::LoadBios)?;
435 mem.read_to_memory(GuestAddress(BIOS_START), bios_image, BIOS_LEN)
436 .map_err(Error::SetupGuestMemory)?;
437 Ok(())
438 }
439
440 /// Loads the kernel from an open file.
441 ///
442 /// # Arguments
443 ///
444 /// * `mem` - The memory to be used by the guest.
445 /// * `kernel_image` - the File object for the specified kernel.
load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)>446 fn load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)> {
447 let elf_result =
448 kernel_loader::load_kernel(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image);
449 if elf_result == Err(kernel_loader::Error::InvalidElfMagicNumber) {
450 bzimage::load_bzimage(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image)
451 .map_err(Error::LoadBzImage)
452 } else {
453 let kernel_end = elf_result.map_err(Error::LoadKernel)?;
454 Ok((Default::default(), kernel_end))
455 }
456 }
457
458 /// Configures the system memory space should be called once per vm before
459 /// starting vcpu threads.
460 ///
461 /// # Arguments
462 ///
463 /// * `mem` - The memory to be used by the guest.
464 /// * `vcpu_count` - Number of virtual CPUs the guest will have.
465 /// * `cmdline` - the kernel commandline
466 /// * `initrd_file` - an initial ramdisk image
setup_system_memory( mem: &GuestMemory, mem_size: u64, vcpu_count: u32, cmdline: &CStr, initrd_file: Option<File>, pci_irqs: Vec<(u32, PciInterruptPin)>, android_fstab: Option<File>, kernel_end: u64, params: boot_params, ) -> Result<()>467 fn setup_system_memory(
468 mem: &GuestMemory,
469 mem_size: u64,
470 vcpu_count: u32,
471 cmdline: &CStr,
472 initrd_file: Option<File>,
473 pci_irqs: Vec<(u32, PciInterruptPin)>,
474 android_fstab: Option<File>,
475 kernel_end: u64,
476 params: boot_params,
477 ) -> Result<()> {
478 kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
479 .map_err(Error::LoadCmdline)?;
480
481 // Track the first free address after the kernel - this is where extra
482 // data like the device tree blob and initrd will be loaded.
483 let mut free_addr = kernel_end;
484
485 let setup_data = if let Some(android_fstab) = android_fstab {
486 let free_addr_aligned = (((free_addr + 64 - 1) / 64) * 64) + 64;
487 let dtb_start = GuestAddress(free_addr_aligned);
488 let dtb_size = fdt::create_fdt(
489 X86_64_FDT_MAX_SIZE as usize,
490 mem,
491 dtb_start.offset(),
492 android_fstab,
493 )
494 .map_err(Error::CreateFdt)?;
495 free_addr = dtb_start.offset() + dtb_size as u64;
496 Some(dtb_start)
497 } else {
498 None
499 };
500
501 let initrd = match initrd_file {
502 Some(mut initrd_file) => {
503 let mut initrd_addr_max = u64::from(params.hdr.initrd_addr_max);
504 // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
505 if initrd_addr_max == 0 {
506 initrd_addr_max = 0x37FFFFFF;
507 }
508
509 let mem_max = mem.end_addr().offset() - 1;
510 if initrd_addr_max > mem_max {
511 initrd_addr_max = mem_max;
512 }
513
514 let (initrd_start, initrd_size) = arch::load_image_high(
515 mem,
516 &mut initrd_file,
517 GuestAddress(free_addr),
518 GuestAddress(initrd_addr_max),
519 sys_util::pagesize() as u64,
520 )
521 .map_err(Error::LoadInitrd)?;
522 Some((initrd_start, initrd_size))
523 }
524 None => None,
525 };
526
527 configure_system(
528 mem,
529 mem_size,
530 GuestAddress(KERNEL_START_OFFSET),
531 GuestAddress(CMDLINE_OFFSET),
532 cmdline.to_bytes().len() + 1,
533 vcpu_count as u8,
534 pci_irqs,
535 setup_data,
536 initrd,
537 params,
538 )?;
539 Ok(())
540 }
541
542 /// Creates a new VM object and initializes architecture specific devices
543 ///
544 /// # Arguments
545 ///
546 /// * `kvm` - The opened /dev/kvm object.
547 /// * `split_irqchip` - Whether to use a split IRQ chip.
548 /// * `mem` - The memory to be used by the guest.
create_vm(kvm: &Kvm, split_irqchip: bool, mem: GuestMemory) -> Result<Vm>549 fn create_vm(kvm: &Kvm, split_irqchip: bool, mem: GuestMemory) -> Result<Vm> {
550 let vm = Vm::new(&kvm, mem).map_err(Error::CreateVm)?;
551 let tss_addr = GuestAddress(0xfffbd000);
552 vm.set_tss_addr(tss_addr).map_err(Error::SetTssAddr)?;
553 if !split_irqchip {
554 vm.create_pit().map_err(Error::CreatePit)?;
555 vm.create_irq_chip().map_err(Error::CreateIrqChip)?;
556 }
557 Ok(vm)
558 }
559
560 /// This creates a GuestMemory object for this VM
561 ///
562 /// * `mem_size` - Desired physical memory size in bytes for this VM
setup_memory(mem_size: u64, has_bios: bool) -> Result<GuestMemory>563 fn setup_memory(mem_size: u64, has_bios: bool) -> Result<GuestMemory> {
564 let arch_mem_regions = arch_memory_regions(mem_size, has_bios);
565 let mem = GuestMemory::new(&arch_mem_regions).map_err(Error::SetupGuestMemory)?;
566 Ok(mem)
567 }
568
/// This creates the interrupt controller device and optionally returns the fd for it.
/// Some architectures may not have a separate descriptor for the interrupt
/// controller, so they would return None even on success.
///
/// On x86_64 this function does nothing and always returns `Ok(None)`; the irq chip is
/// created in `create_vm` instead.
///
/// # Arguments
///
/// * `vm` - the vm object (unused here)
fn create_irq_chip(_vm: &kvm::Vm) -> Result<Option<File>> {
    // Unfortunately X86 and ARM have to do this in completely different order
    // X86 needs to create the irq chip before creating cpus and
    // ARM needs to do it afterwards.
    Ok(None)
}
582
583 /// This returns the first page frame number for use by the balloon driver.
584 ///
585 /// # Arguments
586 ///
587 /// * `mem_size` - the size in bytes of physical ram for the guest
get_base_dev_pfn(mem_size: u64) -> u64588 fn get_base_dev_pfn(mem_size: u64) -> u64 {
589 // Put device memory at a 2MB boundary after physical memory or 4gb, whichever is greater.
590 const MB: u64 = 1024 * 1024;
591 const GB: u64 = 1024 * MB;
592 let mem_size_round_2mb = (mem_size + 2 * MB - 1) / (2 * MB) * (2 * MB);
593 std::cmp::max(mem_size_round_2mb, 4 * GB) / sys_util::pagesize() as u64
594 }
595
596 /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline(stdio_serial_num: Option<u8>) -> kernel_cmdline::Cmdline597 fn get_base_linux_cmdline(stdio_serial_num: Option<u8>) -> kernel_cmdline::Cmdline {
598 let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
599 if stdio_serial_num.is_some() {
600 let tty_string = get_serial_tty_string(stdio_serial_num.unwrap());
601 cmdline.insert("console", &tty_string).unwrap();
602 }
603 cmdline.insert_str("noacpi reboot=k panic=-1").unwrap();
604
605 cmdline
606 }
607
608 /// Returns a system resource allocator.
get_resource_allocator(mem_size: u64, gpu_allocation: bool) -> SystemAllocator609 fn get_resource_allocator(mem_size: u64, gpu_allocation: bool) -> SystemAllocator {
610 const MMIO_BASE: u64 = 0xe0000000;
611 let device_addr_start = Self::get_base_dev_pfn(mem_size) * sys_util::pagesize() as u64;
612 SystemAllocator::builder()
613 .add_io_addresses(0xc000, 0x10000)
614 .add_mmio_addresses(MMIO_BASE, 0x100000)
615 .add_device_addresses(device_addr_start, u64::max_value() - device_addr_start)
616 .create_allocator(X86_64_IRQ_BASE, gpu_allocation)
617 .unwrap()
618 }
619
/// Sets up the IO bus for this platform
///
/// Registers the CMOS device, the i8042 device, either a userspace PIT (split irq chip) or a
/// placeholder for the PIT ports, placeholders for ports 0xed and 0xf0, and the PCI
/// configuration ports.
///
/// # Arguments
///
/// * - `vm` the vm object
/// * - `split_irqchip`: whether to use a split IRQ chip (i.e. userspace PIT/PIC/IOAPIC)
/// * - `exit_evt` - the event fd object which should receive exit events
/// * - `pci` - the PCI configuration device to place at 0xcf8; a placeholder is used if `None`
/// * - `mem_size` - the size in bytes of physical ram for the guest
fn setup_io_bus(
    vm: &mut Vm,
    split_irqchip: bool,
    exit_evt: EventFd,
    pci: Option<Arc<Mutex<devices::PciConfigIo>>>,
    mem_size: u64,
) -> Result<(devices::Bus)> {
    // Placeholder device that relies on the default `BusDevice` methods; it is inserted on
    // ports that should be claimed without a real device behind them.
    struct NoDevice;
    impl devices::BusDevice for NoDevice {
        fn debug_label(&self) -> String {
            "no device".to_owned()
        }
    }

    let mut io_bus = devices::Bus::new();

    // Split the memory size into the part below the 32-bit gap and the part above 4 GiB for
    // the CMOS device.
    let mem_gap_start = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
    let mem_below_4g = std::cmp::min(mem_gap_start, mem_size);
    let mem_above_4g = mem_size.saturating_sub(FIRST_ADDR_PAST_32BITS);

    io_bus
        .insert(
            Arc::new(Mutex::new(devices::Cmos::new(mem_below_4g, mem_above_4g))),
            0x70,
            0x2,
            false,
        )
        .unwrap();
    io_bus
        .insert(
            Arc::new(Mutex::new(devices::I8042Device::new(
                exit_evt.try_clone().map_err(Error::CloneEventFd)?,
            ))),
            0x061,
            0x4,
            false,
        )
        .unwrap();

    let nul_device = Arc::new(Mutex::new(NoDevice));
    if split_irqchip {
        // With a split irq chip the PIT is emulated in userspace and signals IRQ 0 through
        // an irqfd.
        let pit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
        let pit = Arc::new(Mutex::new(
            devices::Pit::new(
                pit_evt.try_clone().map_err(Error::CloneEventFd)?,
                Arc::new(Mutex::new(Clock::new())),
            )
            .map_err(Error::CreatePitDevice)?,
        ));
        // Reserve from 0x40 to 0x61 (the speaker).
        // NOTE(review): 0x40 + 0x22 covers port 0x61, which the i8042 device inserted above
        // (0x61, length 4) also covers. Confirm how `devices::Bus::insert` treats overlapping
        // ranges; if it rejects them, this `unwrap` panics whenever `split_irqchip` is set.
        io_bus.insert(pit.clone(), 0x040, 0x22, false).unwrap();
        vm.register_irqfd(&pit_evt, 0)
            .map_err(Error::RegisterIrqfd)?;
    } else {
        io_bus
            .insert(nul_device.clone(), 0x040, 0x8, false)
            .unwrap(); // ignore pit
    }

    io_bus
        .insert(nul_device.clone(), 0x0ed, 0x1, false)
        .unwrap(); // most likely this one does nothing
    io_bus
        .insert(nul_device.clone(), 0x0f0, 0x2, false)
        .unwrap(); // ignore fpu

    if let Some(pci_root) = pci {
        io_bus.insert(pci_root, 0xcf8, 0x8, false).unwrap();
    } else {
        // ignore pci.
        io_bus
            .insert(nul_device.clone(), 0xcf8, 0x8, false)
            .unwrap();
    }

    Ok(io_bus)
}
705
706 /// Sets up the serial devices for this platform. Returns the serial port number and serial
707 /// device to be used for stdout
708 ///
709 /// # Arguments
710 ///
711 /// * - `vm` the vm object
712 /// * - `io_bus` the I/O bus to add the devices to
713 /// * - `serial_parmaters` - definitions for how the serial devices should be configured
setup_serial_devices( vm: &mut Vm, io_bus: &mut devices::Bus, serial_parameters: &BTreeMap<u8, SerialParameters>, ) -> Result<(Option<u8>, Option<Arc<Mutex<devices::Serial>>>)>714 fn setup_serial_devices(
715 vm: &mut Vm,
716 io_bus: &mut devices::Bus,
717 serial_parameters: &BTreeMap<u8, SerialParameters>,
718 ) -> Result<(Option<u8>, Option<Arc<Mutex<devices::Serial>>>)> {
719 let com_evt_1_3 = EventFd::new().map_err(Error::CreateEventFd)?;
720 let com_evt_2_4 = EventFd::new().map_err(Error::CreateEventFd)?;
721
722 let (stdio_serial_num, stdio_serial) =
723 arch::add_serial_devices(io_bus, &com_evt_1_3, &com_evt_2_4, &serial_parameters)
724 .map_err(Error::CreateSerialDevices)?;
725
726 vm.register_irqfd(&com_evt_1_3, X86_64_SERIAL_1_3_IRQ)
727 .map_err(Error::RegisterIrqfd)?;
728 vm.register_irqfd(&com_evt_2_4, X86_64_SERIAL_2_4_IRQ)
729 .map_err(Error::RegisterIrqfd)?;
730
731 Ok((stdio_serial_num, stdio_serial))
732 }
733
734 /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
735 ///
736 /// # Arguments
737 ///
738 /// * `guest_mem` - The memory to be used by the guest.
739 /// * `kernel_load_offset` - Offset in bytes from `guest_mem` at which the
740 /// kernel starts.
741 /// * `kvm` - The /dev/kvm object that created vcpu.
742 /// * `vm` - The VM object associated with this VCPU.
743 /// * `vcpu` - The VCPU object to configure.
744 /// * `cpu_id` - The id of the given `vcpu`.
745 /// * `num_cpus` - Number of virtual CPUs the guest will have.
configure_vcpu( guest_mem: &GuestMemory, kvm: &Kvm, _vm: &Vm, vcpu: &Vcpu, cpu_id: u64, num_cpus: u64, ) -> Result<()>746 fn configure_vcpu(
747 guest_mem: &GuestMemory,
748 kvm: &Kvm,
749 _vm: &Vm,
750 vcpu: &Vcpu,
751 cpu_id: u64,
752 num_cpus: u64,
753 ) -> Result<()> {
754 let kernel_load_addr = GuestAddress(KERNEL_START_OFFSET);
755 cpuid::setup_cpuid(kvm, vcpu, cpu_id, num_cpus).map_err(Error::SetupCpuid)?;
756 regs::setup_msrs(vcpu).map_err(Error::SetupMsrs)?;
757 let kernel_end = guest_mem
758 .checked_offset(kernel_load_addr, KERNEL_64BIT_ENTRY_OFFSET)
759 .ok_or(Error::KernelOffsetPastEnd)?;
760 regs::setup_regs(
761 vcpu,
762 (kernel_end).offset() as u64,
763 BOOT_STACK_POINTER as u64,
764 ZERO_PAGE_OFFSET as u64,
765 )
766 .map_err(Error::SetupRegs)?;
767 regs::setup_fpu(vcpu).map_err(Error::SetupFpu)?;
768 regs::setup_sregs(guest_mem, vcpu).map_err(Error::SetupSregs)?;
769 interrupts::set_lint(vcpu).map_err(Error::SetLint)?;
770 Ok(())
771 }
772 }
#[cfg(test)]
mod tests {
    use super::*;

    // Guest sizes used below: one well under the 32-bit gap, one just past 4 GiB.
    const SMALL_GUEST: u64 = 1u64 << 29;
    const LARGE_GUEST: u64 = (1u64 << 32) + 0x8000;

    /// A small guest without a BIOS is one region starting at address zero.
    #[test]
    fn regions_lt_4gb_nobios() {
        let layout = arch_memory_regions(SMALL_GUEST, /* has_bios */ false);
        assert_eq!(1, layout.len());
        let (start, len) = layout[0];
        assert_eq!(GuestAddress(0), start);
        assert_eq!(SMALL_GUEST, len);
    }

    /// A guest larger than 4 GiB without a BIOS gets a second region starting at 4 GiB.
    #[test]
    fn regions_gt_4gb_nobios() {
        let layout = arch_memory_regions(LARGE_GUEST, /* has_bios */ false);
        assert_eq!(2, layout.len());
        assert_eq!(GuestAddress(0), layout[0].0);
        assert_eq!(GuestAddress(1u64 << 32), layout[1].0);
    }

    /// A small guest with a BIOS gets an extra region holding exactly the BIOS.
    #[test]
    fn regions_lt_4gb_bios() {
        let layout = arch_memory_regions(SMALL_GUEST, /* has_bios */ true);
        assert_eq!(2, layout.len());
        let (ram_start, ram_len) = layout[0];
        let (bios_start, bios_len) = layout[1];
        assert_eq!(GuestAddress(0), ram_start);
        assert_eq!(SMALL_GUEST, ram_len);
        assert_eq!(GuestAddress(BIOS_START), bios_start);
        assert_eq!(BIOS_LEN as u64, bios_len);
    }

    /// A guest larger than 4 GiB with a BIOS has its second region start at the BIOS.
    #[test]
    fn regions_gt_4gb_bios() {
        let layout = arch_memory_regions(LARGE_GUEST, /* has_bios */ true);
        assert_eq!(2, layout.len());
        assert_eq!(GuestAddress(0), layout[0].0);
        assert_eq!(GuestAddress(BIOS_START), layout[1].0);
    }
}
811