• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 mod fdt;
6 
7 const E820_RAM: u32 = 1;
8 const SETUP_DTB: u32 = 2;
9 const X86_64_FDT_MAX_SIZE: u64 = 0x200000;
10 
11 #[allow(dead_code)]
12 #[allow(non_upper_case_globals)]
13 #[allow(non_camel_case_types)]
14 #[allow(non_snake_case)]
15 mod bootparam;
16 
17 // boot_params is just a series of ints, it is safe to initialize it.
18 unsafe impl data_model::DataInit for bootparam::boot_params {}
19 
20 #[allow(dead_code)]
21 #[allow(non_upper_case_globals)]
22 mod msr_index;
23 
24 #[allow(dead_code)]
25 #[allow(non_upper_case_globals)]
26 #[allow(non_camel_case_types)]
27 #[allow(clippy::all)]
28 mod mpspec;
29 // These mpspec types are only data, reading them from data is a safe initialization.
30 unsafe impl data_model::DataInit for mpspec::mpc_bus {}
31 unsafe impl data_model::DataInit for mpspec::mpc_cpu {}
32 unsafe impl data_model::DataInit for mpspec::mpc_intsrc {}
33 unsafe impl data_model::DataInit for mpspec::mpc_ioapic {}
34 unsafe impl data_model::DataInit for mpspec::mpc_table {}
35 unsafe impl data_model::DataInit for mpspec::mpc_lintsrc {}
36 unsafe impl data_model::DataInit for mpspec::mpf_intel {}
37 
38 mod bzimage;
39 mod cpuid;
40 mod gdt;
41 mod interrupts;
42 mod mptable;
43 mod regs;
44 mod smbios;
45 
46 use std::collections::BTreeMap;
47 use std::error::Error as StdError;
48 use std::ffi::{CStr, CString};
49 use std::fmt::{self, Display};
50 use std::fs::File;
51 use std::io::{self, Seek};
52 use std::mem;
53 use std::sync::Arc;
54 
55 use crate::bootparam::boot_params;
56 use arch::{RunnableLinuxVm, VmComponents, VmImage};
57 use devices::{get_serial_tty_string, PciConfigIo, PciDevice, PciInterruptPin, SerialParameters};
58 use io_jail::Minijail;
59 use kvm::*;
60 use remain::sorted;
61 use resources::SystemAllocator;
62 use sync::Mutex;
63 use sys_util::{Clock, EventFd, GuestAddress, GuestMemory, GuestMemoryError};
64 
/// Errors that can be returned while building or configuring an x86_64 VM.
///
/// The variants are listed alphabetically and carry the `#[sorted]` attribute from the
/// `remain` crate; new variants should be added in sorted position.
#[sorted]
#[derive(Debug)]
pub enum Error {
    /// An `EventFd` could not be cloned.
    CloneEventFd(sys_util::Error),
    /// The kernel command line was rejected as invalid.
    Cmdline(kernel_cmdline::Error),
    /// Configuring the system failed.
    ConfigureSystem,
    /// The caller-supplied device creation callback failed.
    CreateDevices(Box<dyn StdError>),
    /// An `EventFd` could not be created.
    CreateEventFd(sys_util::Error),
    /// The flattened device tree could not be created.
    CreateFdt(arch::fdt::Error),
    /// The IRQ chip could not be created.
    CreateIrqChip(sys_util::Error),
    /// `/dev/kvm` could not be opened.
    CreateKvm(sys_util::Error),
    /// The PCI root hub could not be created.
    CreatePciRoot(arch::DeviceRegistrationError),
    /// The in-kernel PIT could not be created.
    CreatePit(sys_util::Error),
    /// The userspace PIT device could not be created.
    CreatePitDevice(devices::PitError),
    /// The serial devices could not be created.
    CreateSerialDevices(arch::DeviceRegistrationError),
    /// A socket could not be created.
    CreateSocket(io::Error),
    /// A VCPU could not be created.
    CreateVcpu(sys_util::Error),
    /// The VM could not be created.
    CreateVm(sys_util::Error),
    /// The e820 map has no room for another entry.
    E820Configuration,
    /// The kernel extends past the end of guest RAM.
    KernelOffsetPastEnd,
    /// The BIOS image could not be loaded.
    LoadBios(io::Error),
    /// The kernel could not be loaded as a bzImage.
    LoadBzImage(bzimage::Error),
    /// The kernel command line could not be written to guest memory.
    LoadCmdline(kernel_loader::Error),
    /// The initrd could not be loaded.
    LoadInitrd(arch::LoadImageError),
    /// The kernel image could not be loaded.
    LoadKernel(kernel_loader::Error),
    /// An irqfd could not be registered with the VM.
    RegisterIrqfd(sys_util::Error),
    /// The virtual socket device could not be registered.
    RegisterVsock(arch::DeviceRegistrationError),
    /// The local interrupt lines could not be set up.
    SetLint(interrupts::Error),
    /// The TSS address could not be set.
    SetTssAddr(sys_util::Error),
    /// CPUID could not be set up.
    SetupCpuid(cpuid::Error),
    /// The FPU could not be set up.
    SetupFpu(regs::Error),
    /// Guest memory could not be set up.
    SetupGuestMemory(GuestMemoryError),
    /// The MP table could not be set up.
    SetupMptable(mptable::Error),
    /// The MSRs could not be set up.
    SetupMsrs(regs::Error),
    /// The general purpose registers could not be set up.
    SetupRegs(regs::Error),
    /// The SMBIOS tables could not be set up.
    SetupSmbios(smbios::Error),
    /// The special registers could not be set up.
    SetupSregs(regs::Error),
    /// The zero page does not fit inside guest memory.
    ZeroPagePastRamEnd,
    /// Writing the zero page to guest memory failed.
    ZeroPageSetup,
}
105 
/// Human-readable messages for every [`Error`] variant.
impl Display for Error {
    /// Writes a one-line description of the error, followed by the wrapped cause when the
    /// variant carries one.
    #[remain::check]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Error::*;

        // Arms are kept in the same (alphabetical) order as the enum declaration; `#[sorted]`
        // is applied to the match as well.
        #[sorted]
        match self {
            CloneEventFd(e) => write!(f, "unable to clone an EventFd: {}", e),
            Cmdline(e) => write!(f, "the given kernel command line was invalid: {}", e),
            ConfigureSystem => write!(f, "error configuring the system"),
            CreateDevices(e) => write!(f, "error creating devices: {}", e),
            CreateEventFd(e) => write!(f, "unable to make an EventFd: {}", e),
            CreateFdt(e) => write!(f, "failed to create fdt: {}", e),
            CreateIrqChip(e) => write!(f, "failed to create irq chip: {}", e),
            CreateKvm(e) => write!(f, "failed to open /dev/kvm: {}", e),
            CreatePciRoot(e) => write!(f, "failed to create a PCI root hub: {}", e),
            CreatePit(e) => write!(f, "unable to create PIT: {}", e),
            CreatePitDevice(e) => write!(f, "unable to make PIT device: {}", e),
            CreateSerialDevices(e) => write!(f, "unable to create serial devices: {}", e),
            CreateSocket(e) => write!(f, "failed to create socket: {}", e),
            CreateVcpu(e) => write!(f, "failed to create VCPU: {}", e),
            CreateVm(e) => write!(f, "failed to create VM: {}", e),
            E820Configuration => write!(f, "invalid e820 setup params"),
            KernelOffsetPastEnd => write!(f, "the kernel extends past the end of RAM"),
            LoadBios(e) => write!(f, "error loading bios: {}", e),
            LoadBzImage(e) => write!(f, "error loading kernel bzImage: {}", e),
            LoadCmdline(e) => write!(f, "error loading command line: {}", e),
            LoadInitrd(e) => write!(f, "error loading initrd: {}", e),
            LoadKernel(e) => write!(f, "error loading Kernel: {}", e),
            RegisterIrqfd(e) => write!(f, "error registering an IrqFd: {}", e),
            RegisterVsock(e) => write!(f, "error registering virtual socket device: {}", e),
            SetLint(e) => write!(f, "failed to set interrupts: {}", e),
            SetTssAddr(e) => write!(f, "failed to set tss addr: {}", e),
            SetupCpuid(e) => write!(f, "failed to set up cpuid: {}", e),
            SetupFpu(e) => write!(f, "failed to set up FPU: {}", e),
            SetupGuestMemory(e) => write!(f, "failed to set up guest memory: {}", e),
            SetupMptable(e) => write!(f, "failed to set up mptable: {}", e),
            SetupMsrs(e) => write!(f, "failed to set up MSRs: {}", e),
            SetupRegs(e) => write!(f, "failed to set up registers: {}", e),
            SetupSmbios(e) => write!(f, "failed to set up SMBIOS: {}", e),
            SetupSregs(e) => write!(f, "failed to set up sregs: {}", e),
            ZeroPagePastRamEnd => write!(f, "the zero page extends past the end of guest_mem"),
            ZeroPageSetup => write!(f, "error writing the zero page of guest memory"),
        }
    }
}
152 
/// Result type used by this module; the error type is always this module's `Error`.
pub type Result<T> = std::result::Result<T, Error>;

impl std::error::Error for Error {}

/// Unit type that carries the x86_64 implementation of `arch::LinuxArch` and its helpers.
pub struct X8664arch;

// Initial stack pointer passed to `regs::setup_regs` for each vcpu.
const BOOT_STACK_POINTER: u64 = 0x8000;
// Size (768 MiB) of the range directly below 4 GiB that is not backed by regular guest RAM.
const MEM_32BIT_GAP_SIZE: u64 = (768 << 20);
// First guest physical address above the 32-bit address space (4 GiB).
const FIRST_ADDR_PAST_32BITS: u64 = (1 << 32);
// Offset from the kernel load address that is used as the initial instruction pointer.
const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
// Guest physical address at which the `boot_params` structure (zero page) is written.
const ZERO_PAGE_OFFSET: u64 = 0x7000;
/// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
/// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
/// pointer at the effective physical address 0xFFFFFFF0.
const BIOS_LEN: usize = 1 << 20;
// The BIOS image occupies the last `BIOS_LEN` bytes below 4 GiB.
const BIOS_START: u64 = FIRST_ADDR_PAST_32BITS - (BIOS_LEN as u64);

// Guest physical address at which the kernel image is loaded (2 MiB).
const KERNEL_START_OFFSET: u64 = 0x200000;
// Guest physical address at which the kernel command line is written.
const CMDLINE_OFFSET: u64 = 0x20000;
// The command line may grow up to the start of the kernel image.
const CMDLINE_MAX_SIZE: u64 = KERNEL_START_OFFSET - CMDLINE_OFFSET;
// IRQ lines registered for the two serial event fds (ports 1/3 and ports 2/4).
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// First IRQ number handed to the system resource allocator.
const X86_64_IRQ_BASE: u32 = 5;
176 
configure_system( guest_mem: &GuestMemory, _mem_size: u64, kernel_addr: GuestAddress, cmdline_addr: GuestAddress, cmdline_size: usize, num_cpus: u8, pci_irqs: Vec<(u32, PciInterruptPin)>, setup_data: Option<GuestAddress>, initrd: Option<(GuestAddress, usize)>, mut params: boot_params, ) -> Result<()>177 fn configure_system(
178     guest_mem: &GuestMemory,
179     _mem_size: u64,
180     kernel_addr: GuestAddress,
181     cmdline_addr: GuestAddress,
182     cmdline_size: usize,
183     num_cpus: u8,
184     pci_irqs: Vec<(u32, PciInterruptPin)>,
185     setup_data: Option<GuestAddress>,
186     initrd: Option<(GuestAddress, usize)>,
187     mut params: boot_params,
188 ) -> Result<()> {
189     const EBDA_START: u64 = 0x0009fc00;
190     const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
191     const KERNEL_HDR_MAGIC: u32 = 0x53726448;
192     const KERNEL_LOADER_OTHER: u8 = 0xff;
193     const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x1000000; // Must be non-zero.
194     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
195     let end_32bit_gap_start = GuestAddress(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE);
196 
197     // Note that this puts the mptable at 0x0 in guest physical memory.
198     mptable::setup_mptable(guest_mem, num_cpus, pci_irqs).map_err(Error::SetupMptable)?;
199 
200     smbios::setup_smbios(guest_mem).map_err(Error::SetupSmbios)?;
201 
202     params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
203     params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
204     params.hdr.header = KERNEL_HDR_MAGIC;
205     params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
206     params.hdr.cmdline_size = cmdline_size as u32;
207     params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
208     if let Some(setup_data) = setup_data {
209         params.hdr.setup_data = setup_data.offset();
210     }
211     if let Some((initrd_addr, initrd_size)) = initrd {
212         params.hdr.ramdisk_image = initrd_addr.offset() as u32;
213         params.hdr.ramdisk_size = initrd_size as u32;
214     }
215 
216     add_e820_entry(&mut params, 0, EBDA_START, E820_RAM)?;
217 
218     let mem_end = guest_mem.end_addr();
219     if mem_end < end_32bit_gap_start {
220         add_e820_entry(
221             &mut params,
222             kernel_addr.offset() as u64,
223             mem_end.offset_from(kernel_addr) as u64,
224             E820_RAM,
225         )?;
226     } else {
227         add_e820_entry(
228             &mut params,
229             kernel_addr.offset() as u64,
230             end_32bit_gap_start.offset_from(kernel_addr) as u64,
231             E820_RAM,
232         )?;
233         if mem_end > first_addr_past_32bits {
234             add_e820_entry(
235                 &mut params,
236                 first_addr_past_32bits.offset() as u64,
237                 mem_end.offset_from(first_addr_past_32bits) as u64,
238                 E820_RAM,
239             )?;
240         }
241     }
242 
243     let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
244     guest_mem
245         .checked_offset(zero_page_addr, mem::size_of::<boot_params>() as u64)
246         .ok_or(Error::ZeroPagePastRamEnd)?;
247     guest_mem
248         .write_obj_at_addr(params, zero_page_addr)
249         .map_err(|_| Error::ZeroPageSetup)?;
250     Ok(())
251 }
252 
253 /// Add an e820 region to the e820 map.
254 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
add_e820_entry(params: &mut boot_params, addr: u64, size: u64, mem_type: u32) -> Result<()>255 fn add_e820_entry(params: &mut boot_params, addr: u64, size: u64, mem_type: u32) -> Result<()> {
256     if params.e820_entries >= params.e820_table.len() as u8 {
257         return Err(Error::E820Configuration);
258     }
259 
260     params.e820_table[params.e820_entries as usize].addr = addr;
261     params.e820_table[params.e820_entries as usize].size = size;
262     params.e820_table[params.e820_entries as usize].type_ = mem_type;
263     params.e820_entries += 1;
264 
265     Ok(())
266 }
267 
268 /// Returns a Vec of the valid memory addresses.
269 /// These should be used to configure the GuestMemory structure for the platform.
270 /// For x86_64 all addresses are valid from the start of the kernel except a
271 /// carve out at the end of 32bit address space.
arch_memory_regions(size: u64, has_bios: bool) -> Vec<(GuestAddress, u64)>272 fn arch_memory_regions(size: u64, has_bios: bool) -> Vec<(GuestAddress, u64)> {
273     let mem_end = GuestAddress(size);
274     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
275     let end_32bit_gap_start = GuestAddress(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE);
276 
277     let mut regions = Vec::new();
278     if mem_end < end_32bit_gap_start {
279         regions.push((GuestAddress(0), size));
280         if has_bios {
281             regions.push((GuestAddress(BIOS_START), BIOS_LEN as u64));
282         }
283     } else {
284         regions.push((GuestAddress(0), end_32bit_gap_start.offset()));
285         if mem_end > first_addr_past_32bits {
286             let region_start = if has_bios {
287                 GuestAddress(BIOS_START)
288             } else {
289                 first_addr_past_32bits
290             };
291             regions.push((region_start, mem_end.offset_from(first_addr_past_32bits)));
292         } else if has_bios {
293             regions.push((GuestAddress(BIOS_START), BIOS_LEN as u64));
294         }
295     }
296 
297     regions
298 }
299 
impl arch::LinuxArch for X8664arch {
    type Error = Error;

    /// Builds a runnable x86_64 VM from `components`.
    ///
    /// # Arguments
    ///
    /// * `components` - Memory size, vcpu count, boot image and related settings.
    /// * `split_irqchip` - Whether to use a split IRQ chip (userspace PIT instead of in-kernel).
    /// * `serial_parameters` - Definitions for how the serial devices should be configured.
    /// * `create_devices` - Callback that creates the PCI devices (each with an optional jail).
    ///
    /// # Errors
    ///
    /// Returns the `Error` of whichever setup step failed; failures of `create_devices` are
    /// boxed into `Error::CreateDevices`.
    fn build_vm<F, E>(
        mut components: VmComponents,
        split_irqchip: bool,
        serial_parameters: &BTreeMap<u8, SerialParameters>,
        create_devices: F,
    ) -> Result<RunnableLinuxVm>
    where
        F: FnOnce(
            &GuestMemory,
            &mut Vm,
            &mut SystemAllocator,
            &EventFd,
        ) -> std::result::Result<Vec<(Box<dyn PciDevice>, Option<Minijail>)>, E>,
        E: StdError + 'static,
    {
        let mut resources =
            Self::get_resource_allocator(components.memory_size, components.wayland_dmabuf);
        // A BIOS image needs an extra guest memory region just below 4 GiB.
        let has_bios = match components.vm_image {
            VmImage::Bios(_) => true,
            _ => false,
        };
        let mem = Self::setup_memory(components.memory_size, has_bios)?;
        let kvm = Kvm::new().map_err(Error::CreateKvm)?;
        let mut vm = Self::create_vm(&kvm, split_irqchip, mem.clone())?;

        let vcpu_count = components.vcpu_count;
        let mut vcpus = Vec::with_capacity(vcpu_count as usize);
        for cpu_id in 0..vcpu_count {
            let vcpu = Vcpu::new(cpu_id as libc::c_ulong, &kvm, &vm).map_err(Error::CreateVcpu)?;
            // Only a direct kernel boot gets its vcpu state prepared here; for a BIOS image the
            // vcpus are left as created.
            if let VmImage::Kernel(_) = components.vm_image {
                Self::configure_vcpu(
                    vm.get_memory(),
                    &kvm,
                    &vm,
                    &vcpu,
                    cpu_id as u64,
                    vcpu_count as u64,
                )?;
            }
            vcpus.push(vcpu);
        }

        let vcpu_affinity = components.vcpu_affinity;

        // On x86_64 this always yields `None`; `create_vm` above already set up the in-kernel
        // irq chip when `split_irqchip` is false.
        let irq_chip = Self::create_irq_chip(&vm)?;

        let mut mmio_bus = devices::Bus::new();

        let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?;

        // Let the caller create its PCI devices, then place them on a PCI root.
        let pci_devices = create_devices(&mem, &mut vm, &mut resources, &exit_evt)
            .map_err(|e| Error::CreateDevices(Box::new(e)))?;
        let (pci, pci_irqs, pid_debug_label_map) =
            arch::generate_pci_root(pci_devices, &mut mmio_bus, &mut resources, &mut vm)
                .map_err(Error::CreatePciRoot)?;
        let pci_bus = Arc::new(Mutex::new(PciConfigIo::new(pci)));

        let mut io_bus = Self::setup_io_bus(
            &mut vm,
            split_irqchip,
            exit_evt.try_clone().map_err(Error::CloneEventFd)?,
            Some(pci_bus.clone()),
            components.memory_size,
        )?;

        let (stdio_serial_num, stdio_serial) =
            Self::setup_serial_devices(&mut vm, &mut io_bus, &serial_parameters)?;

        // Load the boot image: either a BIOS, or a kernel together with its command line,
        // optional initrd/fstab and the boot tables.
        match components.vm_image {
            VmImage::Bios(ref mut bios) => Self::load_bios(&mem, bios)?,
            VmImage::Kernel(ref mut kernel_image) => {
                let mut cmdline = Self::get_base_linux_cmdline(stdio_serial_num);
                for param in components.extra_kernel_params {
                    cmdline.insert_str(&param).map_err(Error::Cmdline)?;
                }

                // separate out load_kernel from other setup to get a specific error for
                // kernel loading
                let (params, kernel_end) = Self::load_kernel(&mem, kernel_image)?;

                Self::setup_system_memory(
                    &mem,
                    components.memory_size,
                    vcpu_count,
                    &CString::new(cmdline).unwrap(),
                    components.initrd_image,
                    pci_irqs,
                    components.android_fstab,
                    kernel_end,
                    params,
                )?;
            }
        }
        Ok(RunnableLinuxVm {
            vm,
            kvm,
            resources,
            stdio_serial,
            exit_evt,
            vcpus,
            vcpu_affinity,
            irq_chip,
            io_bus,
            mmio_bus,
            pid_debug_label_map,
        })
    }
}
411 
412 impl X8664arch {
413     /// Loads the bios from an open file.
414     ///
415     /// # Arguments
416     ///
417     /// * `mem` - The memory to be used by the guest.
418     /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>419     fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
420         let bios_image_length = bios_image
421             .seek(io::SeekFrom::End(0))
422             .map_err(Error::LoadBios)?;
423         if bios_image_length != BIOS_LEN as u64 {
424             return Err(Error::LoadBios(io::Error::new(
425                 io::ErrorKind::InvalidData,
426                 format!(
427                     "bios was {} bytes, expected {}",
428                     bios_image_length, BIOS_LEN
429                 ),
430             )));
431         }
432         bios_image
433             .seek(io::SeekFrom::Start(0))
434             .map_err(Error::LoadBios)?;
435         mem.read_to_memory(GuestAddress(BIOS_START), bios_image, BIOS_LEN)
436             .map_err(Error::SetupGuestMemory)?;
437         Ok(())
438     }
439 
440     /// Loads the kernel from an open file.
441     ///
442     /// # Arguments
443     ///
444     /// * `mem` - The memory to be used by the guest.
445     /// * `kernel_image` - the File object for the specified kernel.
load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)>446     fn load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)> {
447         let elf_result =
448             kernel_loader::load_kernel(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image);
449         if elf_result == Err(kernel_loader::Error::InvalidElfMagicNumber) {
450             bzimage::load_bzimage(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image)
451                 .map_err(Error::LoadBzImage)
452         } else {
453             let kernel_end = elf_result.map_err(Error::LoadKernel)?;
454             Ok((Default::default(), kernel_end))
455         }
456     }
457 
458     /// Configures the system memory space should be called once per vm before
459     /// starting vcpu threads.
460     ///
461     /// # Arguments
462     ///
463     /// * `mem` - The memory to be used by the guest.
464     /// * `vcpu_count` - Number of virtual CPUs the guest will have.
465     /// * `cmdline` - the kernel commandline
466     /// * `initrd_file` - an initial ramdisk image
setup_system_memory( mem: &GuestMemory, mem_size: u64, vcpu_count: u32, cmdline: &CStr, initrd_file: Option<File>, pci_irqs: Vec<(u32, PciInterruptPin)>, android_fstab: Option<File>, kernel_end: u64, params: boot_params, ) -> Result<()>467     fn setup_system_memory(
468         mem: &GuestMemory,
469         mem_size: u64,
470         vcpu_count: u32,
471         cmdline: &CStr,
472         initrd_file: Option<File>,
473         pci_irqs: Vec<(u32, PciInterruptPin)>,
474         android_fstab: Option<File>,
475         kernel_end: u64,
476         params: boot_params,
477     ) -> Result<()> {
478         kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
479             .map_err(Error::LoadCmdline)?;
480 
481         // Track the first free address after the kernel - this is where extra
482         // data like the device tree blob and initrd will be loaded.
483         let mut free_addr = kernel_end;
484 
485         let setup_data = if let Some(android_fstab) = android_fstab {
486             let free_addr_aligned = (((free_addr + 64 - 1) / 64) * 64) + 64;
487             let dtb_start = GuestAddress(free_addr_aligned);
488             let dtb_size = fdt::create_fdt(
489                 X86_64_FDT_MAX_SIZE as usize,
490                 mem,
491                 dtb_start.offset(),
492                 android_fstab,
493             )
494             .map_err(Error::CreateFdt)?;
495             free_addr = dtb_start.offset() + dtb_size as u64;
496             Some(dtb_start)
497         } else {
498             None
499         };
500 
501         let initrd = match initrd_file {
502             Some(mut initrd_file) => {
503                 let mut initrd_addr_max = u64::from(params.hdr.initrd_addr_max);
504                 // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
505                 if initrd_addr_max == 0 {
506                     initrd_addr_max = 0x37FFFFFF;
507                 }
508 
509                 let mem_max = mem.end_addr().offset() - 1;
510                 if initrd_addr_max > mem_max {
511                     initrd_addr_max = mem_max;
512                 }
513 
514                 let (initrd_start, initrd_size) = arch::load_image_high(
515                     mem,
516                     &mut initrd_file,
517                     GuestAddress(free_addr),
518                     GuestAddress(initrd_addr_max),
519                     sys_util::pagesize() as u64,
520                 )
521                 .map_err(Error::LoadInitrd)?;
522                 Some((initrd_start, initrd_size))
523             }
524             None => None,
525         };
526 
527         configure_system(
528             mem,
529             mem_size,
530             GuestAddress(KERNEL_START_OFFSET),
531             GuestAddress(CMDLINE_OFFSET),
532             cmdline.to_bytes().len() + 1,
533             vcpu_count as u8,
534             pci_irqs,
535             setup_data,
536             initrd,
537             params,
538         )?;
539         Ok(())
540     }
541 
542     /// Creates a new VM object and initializes architecture specific devices
543     ///
544     /// # Arguments
545     ///
546     /// * `kvm` - The opened /dev/kvm object.
547     /// * `split_irqchip` - Whether to use a split IRQ chip.
548     /// * `mem` - The memory to be used by the guest.
create_vm(kvm: &Kvm, split_irqchip: bool, mem: GuestMemory) -> Result<Vm>549     fn create_vm(kvm: &Kvm, split_irqchip: bool, mem: GuestMemory) -> Result<Vm> {
550         let vm = Vm::new(&kvm, mem).map_err(Error::CreateVm)?;
551         let tss_addr = GuestAddress(0xfffbd000);
552         vm.set_tss_addr(tss_addr).map_err(Error::SetTssAddr)?;
553         if !split_irqchip {
554             vm.create_pit().map_err(Error::CreatePit)?;
555             vm.create_irq_chip().map_err(Error::CreateIrqChip)?;
556         }
557         Ok(vm)
558     }
559 
560     /// This creates a GuestMemory object for this VM
561     ///
562     /// * `mem_size` - Desired physical memory size in bytes for this VM
setup_memory(mem_size: u64, has_bios: bool) -> Result<GuestMemory>563     fn setup_memory(mem_size: u64, has_bios: bool) -> Result<GuestMemory> {
564         let arch_mem_regions = arch_memory_regions(mem_size, has_bios);
565         let mem = GuestMemory::new(&arch_mem_regions).map_err(Error::SetupGuestMemory)?;
566         Ok(mem)
567     }
568 
    /// This creates the interrupt controller device and optionally returns the fd for it.
    /// Some architectures may not have a separate descriptor for the interrupt
    /// controller, so they would return None even on success.
    ///
    /// On x86_64 this does nothing and always returns `Ok(None)`.
    ///
    /// # Arguments
    ///
    /// * `vm` - the vm object
    fn create_irq_chip(_vm: &kvm::Vm) -> Result<Option<File>> {
        // Unfortunately X86 and ARM have to do this in completely different order
        // X86 needs to create the irq chip before creating cpus and
        // ARM needs to do it afterwards.
        Ok(None)
    }
582 
583     /// This returns the first page frame number for use by the balloon driver.
584     ///
585     /// # Arguments
586     ///
587     /// * `mem_size` - the size in bytes of physical ram for the guest
get_base_dev_pfn(mem_size: u64) -> u64588     fn get_base_dev_pfn(mem_size: u64) -> u64 {
589         // Put device memory at a 2MB boundary after physical memory or 4gb, whichever is greater.
590         const MB: u64 = 1024 * 1024;
591         const GB: u64 = 1024 * MB;
592         let mem_size_round_2mb = (mem_size + 2 * MB - 1) / (2 * MB) * (2 * MB);
593         std::cmp::max(mem_size_round_2mb, 4 * GB) / sys_util::pagesize() as u64
594     }
595 
596     /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline(stdio_serial_num: Option<u8>) -> kernel_cmdline::Cmdline597     fn get_base_linux_cmdline(stdio_serial_num: Option<u8>) -> kernel_cmdline::Cmdline {
598         let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
599         if stdio_serial_num.is_some() {
600             let tty_string = get_serial_tty_string(stdio_serial_num.unwrap());
601             cmdline.insert("console", &tty_string).unwrap();
602         }
603         cmdline.insert_str("noacpi reboot=k panic=-1").unwrap();
604 
605         cmdline
606     }
607 
608     /// Returns a system resource allocator.
get_resource_allocator(mem_size: u64, gpu_allocation: bool) -> SystemAllocator609     fn get_resource_allocator(mem_size: u64, gpu_allocation: bool) -> SystemAllocator {
610         const MMIO_BASE: u64 = 0xe0000000;
611         let device_addr_start = Self::get_base_dev_pfn(mem_size) * sys_util::pagesize() as u64;
612         SystemAllocator::builder()
613             .add_io_addresses(0xc000, 0x10000)
614             .add_mmio_addresses(MMIO_BASE, 0x100000)
615             .add_device_addresses(device_addr_start, u64::max_value() - device_addr_start)
616             .create_allocator(X86_64_IRQ_BASE, gpu_allocation)
617             .unwrap()
618     }
619 
620     /// Sets up the IO bus for this platform
621     ///
622     /// # Arguments
623     ///
624     /// * - `vm` the vm object
625     /// * - `split_irqchip`: whether to use a split IRQ chip (i.e. userspace PIT/PIC/IOAPIC)
626     /// * - `exit_evt` - the event fd object which should receive exit events
627     /// * - `mem_size` - the size in bytes of physical ram for the guest
setup_io_bus( vm: &mut Vm, split_irqchip: bool, exit_evt: EventFd, pci: Option<Arc<Mutex<devices::PciConfigIo>>>, mem_size: u64, ) -> Result<(devices::Bus)>628     fn setup_io_bus(
629         vm: &mut Vm,
630         split_irqchip: bool,
631         exit_evt: EventFd,
632         pci: Option<Arc<Mutex<devices::PciConfigIo>>>,
633         mem_size: u64,
634     ) -> Result<(devices::Bus)> {
635         struct NoDevice;
636         impl devices::BusDevice for NoDevice {
637             fn debug_label(&self) -> String {
638                 "no device".to_owned()
639             }
640         }
641 
642         let mut io_bus = devices::Bus::new();
643 
644         let mem_gap_start = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
645         let mem_below_4g = std::cmp::min(mem_gap_start, mem_size);
646         let mem_above_4g = mem_size.saturating_sub(FIRST_ADDR_PAST_32BITS);
647 
648         io_bus
649             .insert(
650                 Arc::new(Mutex::new(devices::Cmos::new(mem_below_4g, mem_above_4g))),
651                 0x70,
652                 0x2,
653                 false,
654             )
655             .unwrap();
656         io_bus
657             .insert(
658                 Arc::new(Mutex::new(devices::I8042Device::new(
659                     exit_evt.try_clone().map_err(Error::CloneEventFd)?,
660                 ))),
661                 0x061,
662                 0x4,
663                 false,
664             )
665             .unwrap();
666 
667         let nul_device = Arc::new(Mutex::new(NoDevice));
668         if split_irqchip {
669             let pit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
670             let pit = Arc::new(Mutex::new(
671                 devices::Pit::new(
672                     pit_evt.try_clone().map_err(Error::CloneEventFd)?,
673                     Arc::new(Mutex::new(Clock::new())),
674                 )
675                 .map_err(Error::CreatePitDevice)?,
676             ));
677             // Reserve from 0x40 to 0x61 (the speaker).
678             io_bus.insert(pit.clone(), 0x040, 0x22, false).unwrap();
679             vm.register_irqfd(&pit_evt, 0)
680                 .map_err(Error::RegisterIrqfd)?;
681         } else {
682             io_bus
683                 .insert(nul_device.clone(), 0x040, 0x8, false)
684                 .unwrap(); // ignore pit
685         }
686 
687         io_bus
688             .insert(nul_device.clone(), 0x0ed, 0x1, false)
689             .unwrap(); // most likely this one does nothing
690         io_bus
691             .insert(nul_device.clone(), 0x0f0, 0x2, false)
692             .unwrap(); // ignore fpu
693 
694         if let Some(pci_root) = pci {
695             io_bus.insert(pci_root, 0xcf8, 0x8, false).unwrap();
696         } else {
697             // ignore pci.
698             io_bus
699                 .insert(nul_device.clone(), 0xcf8, 0x8, false)
700                 .unwrap();
701         }
702 
703         Ok(io_bus)
704     }
705 
706     /// Sets up the serial devices for this platform. Returns the serial port number and serial
707     /// device to be used for stdout
708     ///
709     /// # Arguments
710     ///
711     /// * - `vm` the vm object
712     /// * - `io_bus` the I/O bus to add the devices to
713     /// * - `serial_parmaters` - definitions for how the serial devices should be configured
setup_serial_devices( vm: &mut Vm, io_bus: &mut devices::Bus, serial_parameters: &BTreeMap<u8, SerialParameters>, ) -> Result<(Option<u8>, Option<Arc<Mutex<devices::Serial>>>)>714     fn setup_serial_devices(
715         vm: &mut Vm,
716         io_bus: &mut devices::Bus,
717         serial_parameters: &BTreeMap<u8, SerialParameters>,
718     ) -> Result<(Option<u8>, Option<Arc<Mutex<devices::Serial>>>)> {
719         let com_evt_1_3 = EventFd::new().map_err(Error::CreateEventFd)?;
720         let com_evt_2_4 = EventFd::new().map_err(Error::CreateEventFd)?;
721 
722         let (stdio_serial_num, stdio_serial) =
723             arch::add_serial_devices(io_bus, &com_evt_1_3, &com_evt_2_4, &serial_parameters)
724                 .map_err(Error::CreateSerialDevices)?;
725 
726         vm.register_irqfd(&com_evt_1_3, X86_64_SERIAL_1_3_IRQ)
727             .map_err(Error::RegisterIrqfd)?;
728         vm.register_irqfd(&com_evt_2_4, X86_64_SERIAL_2_4_IRQ)
729             .map_err(Error::RegisterIrqfd)?;
730 
731         Ok((stdio_serial_num, stdio_serial))
732     }
733 
734     /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
735     ///
736     /// # Arguments
737     ///
738     /// * `guest_mem` - The memory to be used by the guest.
739     /// * `kernel_load_offset` - Offset in bytes from `guest_mem` at which the
740     ///                          kernel starts.
741     /// * `kvm` - The /dev/kvm object that created vcpu.
742     /// * `vm` - The VM object associated with this VCPU.
743     /// * `vcpu` - The VCPU object to configure.
744     /// * `cpu_id` - The id of the given `vcpu`.
745     /// * `num_cpus` - Number of virtual CPUs the guest will have.
configure_vcpu( guest_mem: &GuestMemory, kvm: &Kvm, _vm: &Vm, vcpu: &Vcpu, cpu_id: u64, num_cpus: u64, ) -> Result<()>746     fn configure_vcpu(
747         guest_mem: &GuestMemory,
748         kvm: &Kvm,
749         _vm: &Vm,
750         vcpu: &Vcpu,
751         cpu_id: u64,
752         num_cpus: u64,
753     ) -> Result<()> {
754         let kernel_load_addr = GuestAddress(KERNEL_START_OFFSET);
755         cpuid::setup_cpuid(kvm, vcpu, cpu_id, num_cpus).map_err(Error::SetupCpuid)?;
756         regs::setup_msrs(vcpu).map_err(Error::SetupMsrs)?;
757         let kernel_end = guest_mem
758             .checked_offset(kernel_load_addr, KERNEL_64BIT_ENTRY_OFFSET)
759             .ok_or(Error::KernelOffsetPastEnd)?;
760         regs::setup_regs(
761             vcpu,
762             (kernel_end).offset() as u64,
763             BOOT_STACK_POINTER as u64,
764             ZERO_PAGE_OFFSET as u64,
765         )
766         .map_err(Error::SetupRegs)?;
767         regs::setup_fpu(vcpu).map_err(Error::SetupFpu)?;
768         regs::setup_sregs(guest_mem, vcpu).map_err(Error::SetupSregs)?;
769         interrupts::set_lint(vcpu).map_err(Error::SetLint)?;
770         Ok(())
771     }
772 }
#[cfg(test)]
mod tests {
    use super::*;

    /// A memory size that ends below the 32-bit gap.
    const HALF_GB: u64 = 1 << 29;
    /// A memory size that ends just above 4 GiB.
    const JUST_OVER_4GB: u64 = (1 << 32) + 0x8000;

    /// Collects only the start address of every region.
    fn region_starts(regions: &[(GuestAddress, u64)]) -> Vec<GuestAddress> {
        regions.iter().map(|&(start, _)| start).collect()
    }

    #[test]
    fn regions_lt_4gb_nobios() {
        let regions = arch_memory_regions(HALF_GB, /* has_bios */ false);
        assert_eq!(vec![(GuestAddress(0), HALF_GB)], regions);
    }

    #[test]
    fn regions_gt_4gb_nobios() {
        let regions = arch_memory_regions(JUST_OVER_4GB, /* has_bios */ false);
        assert_eq!(
            vec![GuestAddress(0), GuestAddress(1u64 << 32)],
            region_starts(&regions)
        );
    }

    #[test]
    fn regions_lt_4gb_bios() {
        let regions = arch_memory_regions(HALF_GB, /* has_bios */ true);
        let expected = vec![
            (GuestAddress(0), HALF_GB),
            (GuestAddress(BIOS_START), BIOS_LEN as u64),
        ];
        assert_eq!(expected, regions);
    }

    #[test]
    fn regions_gt_4gb_bios() {
        let regions = arch_memory_regions(JUST_OVER_4GB, /* has_bios */ true);
        assert_eq!(
            vec![GuestAddress(0), GuestAddress(BIOS_START)],
            region_starts(&regions)
        );
    }
}
811