• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! Virtual machine architecture support code.
6 
7 pub mod android;
8 pub mod fdt;
9 pub mod pstore;
10 pub mod serial;
11 
12 pub mod sys;
13 
14 use std::collections::BTreeMap;
15 use std::error::Error as StdError;
16 use std::fs::File;
17 use std::io;
18 use std::ops::Deref;
19 use std::path::PathBuf;
20 use std::str::FromStr;
21 use std::sync::mpsc;
22 use std::sync::mpsc::SendError;
23 use std::sync::Arc;
24 
25 use acpi_tables::sdt::SDT;
26 use base::syslog;
27 use base::AsRawDescriptors;
28 use base::FileGetLen;
29 use base::FileReadWriteAtVolatile;
30 use base::RecvTube;
31 use base::SendTube;
32 use base::Tube;
33 use devices::virtio::VirtioDevice;
34 use devices::BarRange;
35 use devices::Bus;
36 use devices::BusDevice;
37 use devices::BusDeviceObj;
38 use devices::BusError;
39 use devices::BusResumeDevice;
40 use devices::FwCfgParameters;
41 use devices::GpeScope;
42 use devices::HotPlugBus;
43 use devices::IrqChip;
44 use devices::IrqEventSource;
45 use devices::PciAddress;
46 use devices::PciBus;
47 use devices::PciDevice;
48 use devices::PciDeviceError;
49 use devices::PciInterruptPin;
50 use devices::PciRoot;
51 use devices::PciRootCommand;
52 use devices::PreferredIrq;
53 #[cfg(any(target_os = "android", target_os = "linux"))]
54 use devices::ProxyDevice;
55 use devices::SerialHardware;
56 use devices::SerialParameters;
57 pub use fdt::apply_device_tree_overlays;
58 pub use fdt::DtbOverlay;
59 #[cfg(feature = "gdb")]
60 use gdbstub::arch::Arch;
61 use hypervisor::MemCacheType;
62 use hypervisor::Vm;
63 #[cfg(windows)]
64 use jail::FakeMinijailStub as Minijail;
65 #[cfg(any(target_os = "android", target_os = "linux"))]
66 use minijail::Minijail;
67 use remain::sorted;
68 use resources::SystemAllocator;
69 use resources::SystemAllocatorConfig;
70 use serde::de::Visitor;
71 use serde::Deserialize;
72 use serde::Serialize;
73 use serde_keyvalue::FromKeyValues;
74 pub use serial::add_serial_devices;
75 pub use serial::get_serial_cmdline;
76 pub use serial::set_default_serial_parameters;
77 pub use serial::GetSerialCmdlineError;
78 pub use serial::SERIAL_ADDR;
79 use sync::Condvar;
80 use sync::Mutex;
81 #[cfg(any(target_os = "android", target_os = "linux"))]
82 pub use sys::linux::PlatformBusResources;
83 use thiserror::Error;
84 use uuid::Uuid;
85 use vm_control::BatControl;
86 use vm_control::BatteryType;
87 use vm_control::PmResource;
88 use vm_memory::GuestAddress;
89 use vm_memory::GuestMemory;
90 use vm_memory::GuestMemoryError;
91 use vm_memory::MemoryRegionInformation;
92 use vm_memory::MemoryRegionOptions;
93 
// Select the architecture-specific types for the compilation target and re-export them under
// common `*Arch` aliases (`IrqChipArch`, `CpuConfigArch`, `HypervisorArch`, `VcpuArch`,
// `VcpuInitArch`, `VmArch`, plus `GdbArch` when the `gdb` feature is enabled), so the rest of
// this file can name one type per concept.
//
// There is no fallback branch: on any other target architecture none of these aliases exist.
cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] {
        pub use devices::IrqChipAArch64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::aarch64::AArch64 as GdbArch;
        pub use hypervisor::CpuConfigAArch64 as CpuConfigArch;
        // No AArch64-specific hypervisor trait is aliased here; the generic one is used.
        pub use hypervisor::Hypervisor as HypervisorArch;
        pub use hypervisor::VcpuAArch64 as VcpuArch;
        pub use hypervisor::VcpuInitAArch64 as VcpuInitArch;
        pub use hypervisor::VmAArch64 as VmArch;
    } else if #[cfg(target_arch = "riscv64")] {
        pub use devices::IrqChipRiscv64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::riscv::Riscv64 as GdbArch;
        pub use hypervisor::CpuConfigRiscv64 as CpuConfigArch;
        // As on AArch64, the generic hypervisor trait is used.
        pub use hypervisor::Hypervisor as HypervisorArch;
        pub use hypervisor::VcpuInitRiscv64 as VcpuInitArch;
        pub use hypervisor::VcpuRiscv64 as VcpuArch;
        pub use hypervisor::VmRiscv64 as VmArch;
    } else if #[cfg(target_arch = "x86_64")] {
        pub use devices::IrqChipX86_64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::x86::X86_64_SSE as GdbArch;
        pub use hypervisor::CpuConfigX86_64 as CpuConfigArch;
        // x86_64 is the only branch that aliases an architecture-specific hypervisor trait.
        pub use hypervisor::HypervisorX86_64 as HypervisorArch;
        pub use hypervisor::VcpuInitX86_64 as VcpuInitArch;
        pub use hypervisor::VcpuX86_64 as VcpuArch;
        pub use hypervisor::VmX86_64 as VmArch;
    }
}
124 
/// An open image file for the VM, tagged by what kind of image it is.
pub enum VmImage {
    /// A kernel image file.
    Kernel(File),
    /// A BIOS image file.
    Bios(File),
}
129 
/// Pstore configuration: a backing path and a size.
///
/// When deserialized, field names use kebab-case and unknown fields are rejected.
#[derive(Clone, Debug, Deserialize, Serialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct Pstore {
    /// Path associated with the pstore.
    pub path: PathBuf,
    /// Size of the pstore (presumably in bytes - TODO confirm against the consumer).
    pub size: u32,
}
136 
/// Where the FDT is placed in guest memory.
///
/// When deserialized, variant names use kebab-case (e.g. `after-payload`).
#[derive(Clone, Copy, Debug, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub enum FdtPosition {
    /// At the start of RAM.
    Start,
    /// Near the end of RAM.
    End,
    /// After the payload, with some padding for alignment.
    AfterPayload,
}
147 
/// Set of CPU cores.
///
/// Stored as a plain `Vec<usize>` of core indices: the order in which indices were supplied is
/// kept and duplicates are not removed.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct CpuSet(Vec<usize>);

impl CpuSet {
    /// Builds a `CpuSet` from any iterable of core indices.
    pub fn new<I: IntoIterator<Item = usize>>(cpus: I) -> Self {
        let cores: Vec<usize> = cpus.into_iter().collect();
        Self(cores)
    }

    /// Returns a borrowing iterator over the core indices, in stored order.
    pub fn iter(&self) -> std::slice::Iter<'_, usize> {
        let Self(cores) = self;
        cores.as_slice().iter()
    }
}
161 
162 impl FromIterator<usize> for CpuSet {
from_iter<T>(iter: T) -> Self where T: IntoIterator<Item = usize>,163     fn from_iter<T>(iter: T) -> Self
164     where
165         T: IntoIterator<Item = usize>,
166     {
167         CpuSet::new(iter)
168     }
169 }
170 
/// The SVE config for Vcpus.
///
/// Only defined on arm/aarch64 targets. Both fields are optional when deserializing and default
/// to `false`; unknown fields are rejected.
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SveConfig {
    /// Use SVE
    #[serde(default)]
    pub enable: bool,
    /// Detect if SVE is available and enable accordingly. `enable` is ignored if auto is true
    #[serde(default)]
    pub auto: bool,
}
183 
/// Parses one CPU specification and appends the cores it names to `cpuset`.
///
/// `s` is either a single index (`"3"`) or an inclusive, low-to-high range (`"2-5"`). On success
/// every index in the range is pushed onto `cpuset`; on failure `cpuset` is left untouched and a
/// human-readable message is returned.
fn parse_cpu_range(s: &str, cpuset: &mut Vec<usize>) -> Result<(), String> {
    // Shared index parser so that both range ends report errors the same way.
    let parse_index = |text: &str| -> Result<usize, String> {
        match text.parse::<usize>() {
            Ok(index) => Ok(index),
            Err(_) => Err(format!(
                "invalid CPU index {} - index must be a non-negative integer",
                text
            )),
        }
    };

    let cores = if let Some((low_text, high_text)) = s.split_once('-') {
        // The lower bound is validated first, so its error takes precedence.
        let low = parse_index(low_text)?;
        let high = parse_index(high_text)?;

        if low > high {
            return Err(format!(
                "invalid CPU range {} - ranges must be from low to high",
                s
            ));
        }
        low..=high
    } else {
        // No dash: a single core, expressed as a one-element range.
        let only = parse_index(s)?;
        only..=only
    };

    cpuset.extend(cores);

    Ok(())
}
217 
218 impl FromStr for CpuSet {
219     type Err = String;
220 
from_str(s: &str) -> Result<Self, Self::Err>221     fn from_str(s: &str) -> Result<Self, Self::Err> {
222         let mut cpuset = Vec::new();
223         for part in s.split(',') {
224             parse_cpu_range(part, &mut cpuset)?;
225         }
226         Ok(CpuSet::new(cpuset))
227     }
228 }
229 
230 impl Deref for CpuSet {
231     type Target = Vec<usize>;
232 
deref(&self) -> &Self::Target233     fn deref(&self) -> &Self::Target {
234         &self.0
235     }
236 }
237 
238 impl IntoIterator for CpuSet {
239     type Item = usize;
240     type IntoIter = std::vec::IntoIter<Self::Item>;
241 
into_iter(self) -> Self::IntoIter242     fn into_iter(self) -> Self::IntoIter {
243         self.0.into_iter()
244     }
245 }
246 
247 /// Deserializes a `CpuSet` from a sequence which elements can either be integers, or strings
248 /// representing CPU ranges (e.g. `5-8`).
249 impl<'de> Deserialize<'de> for CpuSet {
deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>,250     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
251     where
252         D: serde::Deserializer<'de>,
253     {
254         struct CpuSetVisitor;
255         impl<'de> Visitor<'de> for CpuSetVisitor {
256             type Value = CpuSet;
257 
258             fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
259                 formatter.write_str("CpuSet")
260             }
261 
262             fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
263             where
264                 A: serde::de::SeqAccess<'de>,
265             {
266                 #[derive(Deserialize)]
267                 #[serde(untagged)]
268                 enum CpuSetValue<'a> {
269                     Single(usize),
270                     Range(&'a str),
271                 }
272 
273                 let mut cpus = Vec::new();
274                 while let Some(cpuset) = seq.next_element::<CpuSetValue>()? {
275                     match cpuset {
276                         CpuSetValue::Single(cpu) => cpus.push(cpu),
277                         CpuSetValue::Range(range) => {
278                             parse_cpu_range(range, &mut cpus).map_err(serde::de::Error::custom)?;
279                         }
280                     }
281                 }
282 
283                 Ok(CpuSet::new(cpus))
284             }
285         }
286 
287         deserializer.deserialize_seq(CpuSetVisitor)
288     }
289 }
290 
291 /// Serializes a `CpuSet` into a sequence of integers and strings representing CPU ranges.
292 impl Serialize for CpuSet {
serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer,293     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
294     where
295         S: serde::Serializer,
296     {
297         use serde::ser::SerializeSeq;
298 
299         let mut seq = serializer.serialize_seq(None)?;
300 
301         // Factorize ranges into "a-b" strings.
302         let mut serialize_range = |start: usize, end: usize| -> Result<(), S::Error> {
303             if start == end {
304                 seq.serialize_element(&start)?;
305             } else {
306                 seq.serialize_element(&format!("{}-{}", start, end))?;
307             }
308 
309             Ok(())
310         };
311 
312         // Current range.
313         let mut range = None;
314         for core in &self.0 {
315             range = match range {
316                 None => Some((core, core)),
317                 Some((start, end)) if *end == *core - 1 => Some((start, core)),
318                 Some((start, end)) => {
319                     serialize_range(*start, *end)?;
320                     Some((core, core))
321                 }
322             };
323         }
324 
325         if let Some((start, end)) = range {
326             serialize_range(*start, *end)?;
327         }
328 
329         seq.end()
330     }
331 }
332 
/// Mapping of guest VCPU threads to host CPU cores.
///
/// Either one `CpuSet` shared by every VCPU thread, or an individual `CpuSet` per VCPU index.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub enum VcpuAffinity {
    /// All VCPU threads will be pinned to the same set of host CPU cores.
    Global(CpuSet),
    /// Each VCPU may be pinned to a set of host CPU cores.
    /// The map key is a guest VCPU index, and the corresponding value is the set of
    /// host CPU indices that the VCPU thread will be allowed to run on.
    /// If a VCPU index is not present in the map, its affinity will not be set.
    PerVcpu(BTreeMap<usize, CpuSet>),
}
344 
/// Memory region with optional size.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct MemoryRegionConfig {
    /// Start of the region (presumably a guest physical address - TODO confirm).
    pub start: u64,
    /// Size of the region, or `None` when no size was given (presumably in bytes - TODO confirm).
    pub size: Option<u64>,
}
351 
/// General PCI config.
///
/// The set of fields depends on the target: `cam` exists only on arm/aarch64 and `ecam` only on
/// x86_64, while `mem` is always present. Every region is optional.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct PciConfig {
    /// region for PCI Configuration Access Mechanism
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    pub cam: Option<MemoryRegionConfig>,
    /// region for PCIe Enhanced Configuration Access Mechanism
    #[cfg(target_arch = "x86_64")]
    pub ecam: Option<MemoryRegionConfig>,
    /// region for non-prefetchable PCI device memory below 4G
    pub mem: Option<MemoryRegionConfig>,
}
364 
/// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to
/// create a `RunnableLinuxVm`.
///
/// The `#[sorted]` attribute (from the `remain` crate) requires the fields to stay in sorted
/// order, so new fields must be inserted at their alphabetical position.
///
/// Several fields only exist on particular targets (see their `#[cfg]` attributes); code that
/// constructs this struct has to gate them the same way.
#[sorted]
pub struct VmComponents {
    #[cfg(all(target_arch = "x86_64", unix))]
    pub ac_adapter: bool,
    pub acpi_sdts: Vec<SDT>,
    pub android_fstab: Option<File>,
    pub boot_cpu: usize,
    pub bootorder_fw_cfg_blob: Vec<u8>,
    #[cfg(target_arch = "x86_64")]
    pub break_linux_pci_config_io: bool,
    // Keyed by CPU index (presumably; the same key type is used by the other per-CPU maps here).
    pub cpu_capacity: BTreeMap<usize, u32>,
    pub cpu_clusters: Vec<CpuSet>,
    #[cfg(all(
        any(target_arch = "arm", target_arch = "aarch64"),
        any(target_os = "android", target_os = "linux")
    ))]
    pub cpu_frequencies: BTreeMap<usize, Vec<u32>>,
    pub delay_rt: bool,
    pub dynamic_power_coefficient: BTreeMap<usize, u32>,
    pub extra_kernel_params: Vec<String>,
    #[cfg(target_arch = "x86_64")]
    pub force_s2idle: bool,
    pub fw_cfg_enable: bool,
    pub fw_cfg_parameters: Vec<FwCfgParameters>,
    pub host_cpu_topology: bool,
    pub hugepages: bool,
    pub hv_cfg: hypervisor::Config,
    pub initrd_image: Option<File>,
    pub itmt: bool,
    // NOTE(review): unit is not visible here - presumably bytes; confirm against callers.
    pub memory_size: u64,
    pub no_i8042: bool,
    pub no_rtc: bool,
    pub no_smt: bool,
    #[cfg(all(
        any(target_arch = "arm", target_arch = "aarch64"),
        any(target_os = "android", target_os = "linux")
    ))]
    pub normalized_cpu_ipc_ratios: BTreeMap<usize, u32>,
    pub pci_config: PciConfig,
    pub pflash_block_size: u32,
    pub pflash_image: Option<File>,
    pub pstore: Option<Pstore>,
    /// A file to load as pVM firmware. Must be `Some` iff
    /// `hv_cfg.protection_type == ProtectionType::UnprotectedWithFirmware`.
    pub pvm_fw: Option<File>,
    pub rt_cpus: CpuSet,
    #[cfg(target_arch = "x86_64")]
    pub smbios: SmbiosOptions,
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    pub sve_config: SveConfig,
    pub swiotlb: Option<u64>,
    pub vcpu_affinity: Option<VcpuAffinity>,
    pub vcpu_count: usize,
    #[cfg(all(
        any(target_arch = "arm", target_arch = "aarch64"),
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domain_paths: BTreeMap<usize, PathBuf>,
    #[cfg(all(
        any(target_arch = "arm", target_arch = "aarch64"),
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domains: BTreeMap<usize, u32>,
    #[cfg(all(
        any(target_arch = "arm", target_arch = "aarch64"),
        any(target_os = "android", target_os = "linux")
    ))]
    pub virt_cpufreq_v2: bool,
    pub vm_image: VmImage,
}
437 
/// Holds the elements needed to run a Linux VM. Created by `build_vm`.
///
/// Generic over the architecture's VM type `V` and vcpu type `Vcpu`. The `#[sorted]` attribute
/// (from the `remain` crate) requires the fields to stay in sorted order.
#[sorted]
pub struct RunnableLinuxVm<V: VmArch, Vcpu: VcpuArch> {
    pub bat_control: Option<BatControl>,
    pub delay_rt: bool,
    pub devices_thread: Option<std::thread::JoinHandle<()>>,
    // NOTE(review): the `u8` key is presumably a PCI bus number - confirm against users.
    pub hotplug_bus: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    pub io_bus: Arc<Bus>,
    pub irq_chip: Box<dyn IrqChipArch>,
    pub mmio_bus: Arc<Bus>,
    pub no_smt: bool,
    pub pid_debug_label_map: BTreeMap<u32, String>,
    #[cfg(any(target_os = "android", target_os = "linux"))]
    pub platform_devices: Vec<Arc<Mutex<dyn BusDevice>>>,
    pub pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
    /// Devices to be notified before the system resumes from the S3 suspended state.
    pub resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>,
    pub root_config: Arc<Mutex<PciRoot>>,
    pub rt_cpus: CpuSet,
    // Both ends of the suspend tube: a shareable sender and the receiver.
    pub suspend_tube: (Arc<Mutex<SendTube>>, RecvTube),
    pub vcpu_affinity: Option<VcpuAffinity>,
    pub vcpu_count: usize,
    pub vcpu_init: Vec<VcpuInitArch>,
    /// If vcpus is None, then it's the responsibility of the vcpu thread to create vcpus.
    /// If it's Some, then `build_vm` already created the vcpus.
    pub vcpus: Option<Vec<Vcpu>>,
    pub vm: V,
    pub vm_request_tubes: Vec<Tube>,
}
467 
/// The device and optional jail.
pub struct VirtioDeviceStub {
    /// The virtio device.
    pub dev: Box<dyn VirtioDevice>,
    /// The jail for the device, if any. On Windows `Minijail` is an alias for
    /// `jail::FakeMinijailStub`.
    pub jail: Option<Minijail>,
}
473 
/// Trait which is implemented for each Linux Architecture in order to
/// set up the memory, cpus, and system devices and to boot the kernel.
///
/// All items are associated functions (none takes `self`); implementors act as a namespace for
/// one architecture.
pub trait LinuxArch {
    /// Error type returned by this architecture's functions.
    type Error: StdError;
    /// Architecture-specific memory layout details, produced by `arch_memory_layout` and passed
    /// by reference to the later setup functions.
    type ArchMemoryLayout;

    /// Decide architecture specific memory layout details to be used by later stages of the VM
    /// setup.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts used to determine the memory layout.
    fn arch_memory_layout(
        components: &VmComponents,
    ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>;

    /// Returns a Vec of the valid memory regions as `(address, length, options)` tuples. These
    /// should be used to configure the `GuestMemory` structure for the platform.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts used to determine the memory layout.
    /// * `arch_memory_layout` - Architecture-specific layout, as produced by
    ///   `arch_memory_layout`.
    /// * `hypervisor` - The hypervisor in use.
    fn guest_memory_layout(
        components: &VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        hypervisor: &impl hypervisor::Hypervisor,
    ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>;

    /// Gets the configuration for a new `SystemAllocator` that fits the given `Vm`'s memory layout.
    ///
    /// This is the per-architecture template for constructing the `SystemAllocator`. Platform
    /// agnostic modifications may be made to this configuration, but the final `SystemAllocator`
    /// will be at least as strict as this configuration.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine to be used as a template for the `SystemAllocator`.
    /// * `arch_memory_layout` - Architecture-specific layout, as produced by
    ///   `arch_memory_layout`.
    fn get_system_allocator_config<V: Vm>(
        vm: &V,
        arch_memory_layout: &Self::ArchMemoryLayout,
    ) -> SystemAllocatorConfig;

    /// Takes `VmComponents` and generates a `RunnableLinuxVm`.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts to use to build the VM.
    /// * `arch_memory_layout` - Architecture-specific layout, as produced by
    ///   `arch_memory_layout`.
    /// * `vm_evt_wrtube` - Tube used by sub-devices to request that crosvm exit because guest wants
    ///   to stop/shut down or requested reset.
    /// * `system_allocator` - Allocator created by this trait's implementation of
    ///   `get_system_allocator_config`.
    /// * `serial_parameters` - Definitions for how the serial devices should be configured.
    /// * `serial_jail` - Jail used for serial devices created here.
    /// * `battery` - Defines what battery device will be created.
    /// * `vm` - A VM implementation to build upon.
    /// * `ramoops_region` - Region allocated for ramoops.
    /// * `devices` - The devices to be built into the VM.
    /// * `irq_chip` - The IRQ chip implementation for the VM.
    /// * `vcpu_ids` - NOTE(review): undocumented; a mutable list of vcpu ids. Whether the
    ///   implementation reads or fills it is not visible from this trait - confirm.
    /// * `dump_device_tree_blob` - NOTE(review): undocumented; presumably a path to write the
    ///   generated device tree blob to - confirm.
    /// * `debugcon_jail` - Jail used for debugcon devices created here.
    /// * `pflash_jail` - Jail used for pflash device created here (x86_64 only).
    /// * `fw_cfg_jail` - Jail used for fw_cfg device created here (x86_64 only).
    /// * `swap_controller` - Only present with the `swap` feature. NOTE(review): undocumented.
    /// * `guest_suspended_cvar` - NOTE(review): undocumented; an optional shared
    ///   `(Mutex<bool>, Condvar)` pair, presumably signalling guest suspension - confirm.
    /// * `device_tree_overlays` - Device tree overlay binaries
    /// * `fdt_position` - Optional placement of the FDT; see `FdtPosition`.
    /// * `no_pmu` - NOTE(review): undocumented; presumably disables the PMU - confirm.
    fn build_vm<V, Vcpu>(
        components: VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        vm_evt_wrtube: &SendTube,
        system_allocator: &mut SystemAllocator,
        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
        serial_jail: Option<Minijail>,
        battery: (Option<BatteryType>, Option<Minijail>),
        vm: V,
        ramoops_region: Option<pstore::RamoopsRegion>,
        devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
        irq_chip: &mut dyn IrqChipArch,
        vcpu_ids: &mut Vec<usize>,
        dump_device_tree_blob: Option<PathBuf>,
        debugcon_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] pflash_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] fw_cfg_jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
        device_tree_overlays: Vec<DtbOverlay>,
        fdt_position: Option<FdtPosition>,
        no_pmu: bool,
    ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
    where
        V: VmArch,
        Vcpu: VcpuArch;

    /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine object.
    /// * `hypervisor` - The `Hypervisor` that created the vcpu.
    /// * `irq_chip` - The `IrqChip` associated with this vm.
    /// * `vcpu` - The VCPU object to configure.
    /// * `vcpu_init` - The data required to initialize VCPU registers and other state.
    /// * `vcpu_id` - The id of the given `vcpu`.
    /// * `num_cpus` - Number of virtual CPUs the guest will have.
    /// * `cpu_config` - CPU feature configurations.
    fn configure_vcpu<V: Vm>(
        vm: &V,
        hypervisor: &dyn HypervisorArch,
        irq_chip: &mut dyn IrqChipArch,
        vcpu: &mut dyn VcpuArch,
        vcpu_init: VcpuInitArch,
        vcpu_id: usize,
        num_cpus: usize,
        cpu_config: Option<CpuConfigArch>,
    ) -> Result<(), Self::Error>;

    /// Configures and adds a PCI device to the VM.
    ///
    /// # Arguments
    ///
    /// * `linux` - The runnable VM to add the device to.
    /// * `device` - The PCI device to register.
    /// * `minijail` - Optional jail for the device (Android/Linux only).
    /// * `resources` - The system allocator.
    /// * `hp_control_tube` - Channel for sending `PciRootCommand`s.
    /// * `swap_controller` - Only present with the `swap` feature.
    ///
    /// Returns a `PciAddress` (presumably the address assigned to the device - confirm).
    fn register_pci_device<V: VmArch, Vcpu: VcpuArch>(
        linux: &mut RunnableLinuxVm<V, Vcpu>,
        device: Box<dyn PciDevice>,
        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
        resources: &mut SystemAllocator,
        hp_control_tube: &mpsc::Sender<PciRootCommand>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<PciAddress, Self::Error>;

    /// Returns frequency map for each of the host's logical cores.
    fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>, Self::Error>;

    /// Returns max-freq map of the host's logical cores.
    fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns capacity map of the host's logical cores.
    fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns cluster masks for each of the host's logical cores.
    fn get_host_cpu_clusters() -> Result<Vec<CpuSet>, Self::Error>;
}
604 
/// Debugger operations on a vCPU of type `T`. Only compiled with the `gdb` feature.
///
/// Register sets and register ids use the types of the target's `GdbArch`. All items are
/// associated functions taking the vCPU as their first argument.
#[cfg(feature = "gdb")]
pub trait GdbOps<T: VcpuArch> {
    /// Error type returned by the debugger operations.
    type Error: StdError;

    /// Reads vCPU's registers.
    fn read_registers(vcpu: &T) -> Result<<GdbArch as Arch>::Registers, Self::Error>;

    /// Writes vCPU's registers.
    fn write_registers(vcpu: &T, regs: &<GdbArch as Arch>::Registers) -> Result<(), Self::Error>;

    /// Reads bytes from the guest memory.
    ///
    /// Reads `len` bytes starting at `vaddr` and returns them.
    fn read_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        len: usize,
    ) -> Result<Vec<u8>, Self::Error>;

    /// Writes bytes to the specified guest memory.
    ///
    /// Writes the contents of `buf` starting at `vaddr`.
    fn write_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        buf: &[u8],
    ) -> Result<(), Self::Error>;

    /// Reads bytes from the guest register.
    ///
    /// Returns an empty vector if `reg_id` is valid but the register is not available.
    fn read_register(vcpu: &T, reg_id: <GdbArch as Arch>::RegId) -> Result<Vec<u8>, Self::Error>;

    /// Writes bytes to the specified guest register.
    fn write_register(
        vcpu: &T,
        reg_id: <GdbArch as Arch>::RegId,
        data: &[u8],
    ) -> Result<(), Self::Error>;

    /// Make the next vCPU's run single-step.
    fn enable_singlestep(vcpu: &T) -> Result<(), Self::Error>;

    /// Get maximum number of hardware breakpoints.
    fn get_max_hw_breakpoints(vcpu: &T) -> Result<usize, Self::Error>;

    /// Set hardware breakpoints at the given addresses.
    fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<(), Self::Error>;
}
652 
/// Errors for device manager.
///
/// The `#[sorted]` attribute (from the `remain` crate) requires the variants to stay in sorted
/// order. Some variants only exist on Android/Linux targets (see their `#[cfg]` attributes).
#[sorted]
#[derive(Error, Debug)]
pub enum DeviceRegistrationError {
    /// No more MMIO space available.
    #[error("no more addresses are available")]
    AddrsExhausted,
    /// Could not allocate device address space for the device.
    #[error("Allocating device addresses: {0}")]
    AllocateDeviceAddrs(PciDeviceError),
    /// Could not allocate IO space for the device.
    #[error("Allocating IO addresses: {0}")]
    AllocateIoAddrs(PciDeviceError),
    /// Could not allocate MMIO or IO resource for the device.
    #[error("Allocating IO resource: {0}")]
    AllocateIoResource(resources::Error),
    /// Could not allocate an IRQ number.
    #[error("Allocating IRQ number")]
    AllocateIrq,
    /// Could not allocate IRQ resource for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("Allocating IRQ resource: {0}")]
    AllocateIrqResource(devices::vfio::VfioError),
    /// Broken pci topology
    #[error("pci topology is broken")]
    BrokenPciTopology,
    /// Unable to clone a jail for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("failed to clone jail: {0}")]
    CloneJail(minijail::Error),
    /// Appending to kernel command line failed.
    #[error("unable to add device to kernel command line: {0}")]
    Cmdline(kernel_cmdline::Error),
    /// Configure window size failed.
    #[error("failed to configure window size: {0}")]
    ConfigureWindowSize(PciDeviceError),
    /// Unable to create a pipe.
    #[error("failed to create pipe: {0}")]
    CreatePipe(base::Error),
    /// Unable to create a root.
    #[error("failed to create pci root: {0}")]
    CreateRoot(anyhow::Error),
    /// Unable to create serial device from serial parameters
    #[error("failed to create serial device: {0}")]
    CreateSerialDevice(devices::SerialError),
    /// Unable to create tube
    #[error("failed to create tube: {0}")]
    CreateTube(base::TubeError),
    /// Could not clone an event.
    #[error("failed to clone event: {0}")]
    EventClone(base::Error),
    /// Could not create an event.
    #[error("failed to create event: {0}")]
    EventCreate(base::Error),
    /// Failed to generate ACPI content.
    #[error("failed to generate ACPI content")]
    GenerateAcpi,
    /// No more IRQs are available.
    #[error("no more IRQs are available")]
    IrqsExhausted,
    /// VFIO device is missing a DT symbol.
    #[error("cannot match VFIO device to DT node due to a missing symbol")]
    MissingDeviceTreeSymbol,
    /// Missing a required serial device.
    #[error("missing required serial device {0}")]
    MissingRequiredSerialDevice(u8),
    /// Could not add a device to the mmio bus.
    #[error("failed to add to mmio bus: {0}")]
    MmioInsert(BusError),
    /// Failed to insert device into PCI root.
    #[error("failed to insert device into PCI root: {0}")]
    PciRootAddDevice(PciDeviceError),
    /// Failed to initialize proxy device for jailed device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("failed to create proxy device: {0}")]
    ProxyDeviceCreation(devices::ProxyError),
    /// Failed to register battery device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("failed to register battery device to VM: {0}")]
    RegisterBattery(devices::BatteryError),
    /// Could not register PCI device to pci root bus
    #[error("failed to register PCI device to pci root bus")]
    RegisterDevice(SendError<PciRootCommand>),
    /// Could not register PCI device capabilities.
    #[error("could not register PCI device capabilities: {0}")]
    RegisterDeviceCapabilities(PciDeviceError),
    /// Failed to register ioevent with VM.
    #[error("failed to register ioevent to VM: {0}")]
    RegisterIoevent(base::Error),
    /// Failed to register irq event with VM.
    #[error("failed to register irq event to VM: {0}")]
    RegisterIrqfd(base::Error),
    /// Could not setup VFIO platform IRQ for the device.
    #[error("Setting up VFIO platform IRQ: {0}")]
    SetupVfioPlatformIrq(anyhow::Error),
}
749 
750 /// Config a PCI device for used by this vm.
configure_pci_device<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, mut device: Box<dyn PciDevice>, #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>, resources: &mut SystemAllocator, hp_control_tube: &mpsc::Sender<PciRootCommand>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<PciAddress, DeviceRegistrationError>751 pub fn configure_pci_device<V: VmArch, Vcpu: VcpuArch>(
752     linux: &mut RunnableLinuxVm<V, Vcpu>,
753     mut device: Box<dyn PciDevice>,
754     #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>,
755     resources: &mut SystemAllocator,
756     hp_control_tube: &mpsc::Sender<PciRootCommand>,
757     #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
758 ) -> Result<PciAddress, DeviceRegistrationError> {
759     // Allocate PCI device address before allocating BARs.
760     let pci_address = device
761         .allocate_address(resources)
762         .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
763 
764     // Allocate ranges that may need to be in the low MMIO region (MmioType::Low).
765     let mmio_ranges = device
766         .allocate_io_bars(resources)
767         .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
768 
769     // Allocate device ranges that may be in low or high MMIO after low-only ranges.
770     let device_ranges = device
771         .allocate_device_bars(resources)
772         .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
773 
774     // If device is a pcie bridge, add its pci bus to pci root
775     if let Some(pci_bus) = device.get_new_pci_bus() {
776         hp_control_tube
777             .send(PciRootCommand::AddBridge(pci_bus))
778             .map_err(DeviceRegistrationError::RegisterDevice)?;
779         let bar_ranges = Vec::new();
780         device
781             .configure_bridge_window(resources, &bar_ranges)
782             .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
783     }
784 
785     // Do not suggest INTx for hot-plug devices.
786     let intx_event = devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
787 
788     if let PreferredIrq::Fixed { pin, gsi } = device.preferred_irq() {
789         resources.reserve_irq(gsi);
790 
791         device.assign_irq(
792             intx_event
793                 .try_clone()
794                 .map_err(DeviceRegistrationError::EventClone)?,
795             pin,
796             gsi,
797         );
798 
799         linux
800             .irq_chip
801             .as_irq_chip_mut()
802             .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(&device))
803             .map_err(DeviceRegistrationError::RegisterIrqfd)?;
804     }
805 
806     let mut keep_rds = device.keep_rds();
807     syslog::push_descriptors(&mut keep_rds);
808     cros_tracing::push_descriptors!(&mut keep_rds);
809     metrics::push_descriptors(&mut keep_rds);
810 
811     device
812         .register_device_capabilities()
813         .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
814 
815     #[cfg(any(target_os = "android", target_os = "linux"))]
816     let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
817         let proxy = ProxyDevice::new(
818             device,
819             jail,
820             keep_rds,
821             #[cfg(feature = "swap")]
822             swap_controller,
823         )
824         .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
825         linux
826             .pid_debug_label_map
827             .insert(proxy.pid() as u32, proxy.debug_label());
828         Arc::new(Mutex::new(proxy))
829     } else {
830         device.on_sandboxed();
831         Arc::new(Mutex::new(device))
832     };
833 
834     #[cfg(windows)]
835     let arced_dev = {
836         device.on_sandboxed();
837         Arc::new(Mutex::new(device))
838     };
839 
840     #[cfg(any(target_os = "android", target_os = "linux"))]
841     hp_control_tube
842         .send(PciRootCommand::Add(pci_address, arced_dev.clone()))
843         .map_err(DeviceRegistrationError::RegisterDevice)?;
844 
845     for range in &mmio_ranges {
846         linux
847             .mmio_bus
848             .insert(arced_dev.clone(), range.addr, range.size)
849             .map_err(DeviceRegistrationError::MmioInsert)?;
850     }
851 
852     for range in &device_ranges {
853         linux
854             .mmio_bus
855             .insert(arced_dev.clone(), range.addr, range.size)
856             .map_err(DeviceRegistrationError::MmioInsert)?;
857     }
858 
859     Ok(pci_address)
860 }
861 
862 // Generate pci topology starting from parent bus
generate_pci_topology( parent_bus: Arc<Mutex<PciBus>>, resources: &mut SystemAllocator, io_ranges: &mut BTreeMap<usize, Vec<BarRange>>, device_ranges: &mut BTreeMap<usize, Vec<BarRange>>, device_addrs: &[PciAddress], devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>, ) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError>863 fn generate_pci_topology(
864     parent_bus: Arc<Mutex<PciBus>>,
865     resources: &mut SystemAllocator,
866     io_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
867     device_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
868     device_addrs: &[PciAddress],
869     devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
870 ) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError> {
871     let mut bar_ranges = Vec::new();
872     let bus_num = parent_bus.lock().get_bus_num();
873     let mut subordinate_bus = bus_num;
874     for (dev_idx, addr) in device_addrs.iter().enumerate() {
875         // Only target for devices that located on this bus
876         if addr.bus == bus_num {
877             // If this device is a pci bridge (a.k.a., it has a pci bus structure),
878             // create its topology recursively
879             if let Some(child_bus) = devices[dev_idx].0.get_new_pci_bus() {
880                 let (child_bar_ranges, child_sub_bus) = generate_pci_topology(
881                     child_bus.clone(),
882                     resources,
883                     io_ranges,
884                     device_ranges,
885                     device_addrs,
886                     devices,
887                 )?;
888                 let device = &mut devices[dev_idx].0;
889                 parent_bus
890                     .lock()
891                     .add_child_bus(child_bus.clone())
892                     .map_err(|_| DeviceRegistrationError::BrokenPciTopology)?;
893                 let bridge_window = device
894                     .configure_bridge_window(resources, &child_bar_ranges)
895                     .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
896                 bar_ranges.extend(bridge_window);
897 
898                 let ranges = device
899                     .allocate_io_bars(resources)
900                     .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
901                 io_ranges.insert(dev_idx, ranges.clone());
902                 bar_ranges.extend(ranges);
903 
904                 let ranges = device
905                     .allocate_device_bars(resources)
906                     .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
907                 device_ranges.insert(dev_idx, ranges.clone());
908                 bar_ranges.extend(ranges);
909 
910                 device.set_subordinate_bus(child_sub_bus);
911 
912                 subordinate_bus = std::cmp::max(subordinate_bus, child_sub_bus);
913             }
914         }
915     }
916 
917     for (dev_idx, addr) in device_addrs.iter().enumerate() {
918         if addr.bus == bus_num {
919             let device = &mut devices[dev_idx].0;
920             // Allocate MMIO for non-bridge devices
921             if device.get_new_pci_bus().is_none() {
922                 let ranges = device
923                     .allocate_io_bars(resources)
924                     .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
925                 io_ranges.insert(dev_idx, ranges.clone());
926                 bar_ranges.extend(ranges);
927 
928                 let ranges = device
929                     .allocate_device_bars(resources)
930                     .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
931                 device_ranges.insert(dev_idx, ranges.clone());
932                 bar_ranges.extend(ranges);
933             }
934         }
935     }
936     Ok((bar_ranges, subordinate_bus))
937 }
938 
939 /// Ensure all PCI devices have an assigned PCI address.
assign_pci_addresses( devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)], resources: &mut SystemAllocator, ) -> Result<(), DeviceRegistrationError>940 pub fn assign_pci_addresses(
941     devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)],
942     resources: &mut SystemAllocator,
943 ) -> Result<(), DeviceRegistrationError> {
944     // First allocate devices with a preferred address.
945     for pci_device in devices
946         .iter_mut()
947         .filter_map(|(device, _jail)| device.as_pci_device_mut())
948         .filter(|pci_device| pci_device.preferred_address().is_some())
949     {
950         let _ = pci_device
951             .allocate_address(resources)
952             .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
953     }
954 
955     // Then allocate addresses for the remaining devices.
956     for pci_device in devices
957         .iter_mut()
958         .filter_map(|(device, _jail)| device.as_pci_device_mut())
959         .filter(|pci_device| pci_device.preferred_address().is_none())
960     {
961         let _ = pci_device
962             .allocate_address(resources)
963             .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
964     }
965 
966     Ok(())
967 }
968 
969 /// Creates a root PCI device for use by this Vm.
generate_pci_root( mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>, irq_chip: &mut dyn IrqChip, mmio_bus: Arc<Bus>, mmio_base: GuestAddress, mmio_register_bit_num: usize, io_bus: Arc<Bus>, resources: &mut SystemAllocator, vm: &mut impl Vm, max_irqs: usize, vcfg_base: Option<u64>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result< ( PciRoot, Vec<(PciAddress, u32, PciInterruptPin)>, BTreeMap<u32, String>, BTreeMap<PciAddress, Vec<u8>>, BTreeMap<PciAddress, Vec<u8>>, ), DeviceRegistrationError, >970 pub fn generate_pci_root(
971     mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
972     irq_chip: &mut dyn IrqChip,
973     mmio_bus: Arc<Bus>,
974     mmio_base: GuestAddress,
975     mmio_register_bit_num: usize,
976     io_bus: Arc<Bus>,
977     resources: &mut SystemAllocator,
978     vm: &mut impl Vm,
979     max_irqs: usize,
980     vcfg_base: Option<u64>,
981     #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
982 ) -> Result<
983     (
984         PciRoot,
985         Vec<(PciAddress, u32, PciInterruptPin)>,
986         BTreeMap<u32, String>,
987         BTreeMap<PciAddress, Vec<u8>>,
988         BTreeMap<PciAddress, Vec<u8>>,
989     ),
990     DeviceRegistrationError,
991 > {
992     let mut device_addrs = Vec::new();
993 
994     for (device, _jail) in devices.iter_mut() {
995         let address = device
996             .allocate_address(resources)
997             .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
998         device_addrs.push(address);
999     }
1000 
1001     let mut device_ranges = BTreeMap::new();
1002     let mut io_ranges = BTreeMap::new();
1003     let root_bus = Arc::new(Mutex::new(PciBus::new(0, 0, false)));
1004 
1005     generate_pci_topology(
1006         root_bus.clone(),
1007         resources,
1008         &mut io_ranges,
1009         &mut device_ranges,
1010         &device_addrs,
1011         &mut devices,
1012     )?;
1013 
1014     let mut root = PciRoot::new(
1015         vm,
1016         Arc::downgrade(&mmio_bus),
1017         mmio_base,
1018         mmio_register_bit_num,
1019         Arc::downgrade(&io_bus),
1020         root_bus,
1021     )
1022     .map_err(DeviceRegistrationError::CreateRoot)?;
1023     #[cfg_attr(windows, allow(unused_mut))]
1024     let mut pid_labels = BTreeMap::new();
1025 
1026     // Allocate legacy INTx
1027     let mut pci_irqs = Vec::new();
1028     let mut irqs: Vec<u32> = Vec::new();
1029 
1030     // Mapping of (bus, dev, pin) -> IRQ number.
1031     let mut dev_pin_irq = BTreeMap::new();
1032 
1033     for (dev_idx, (device, _jail)) in devices.iter_mut().enumerate() {
1034         let pci_address = device_addrs[dev_idx];
1035 
1036         let irq = match device.preferred_irq() {
1037             PreferredIrq::Fixed { pin, gsi } => {
1038                 // The device reported a preferred IRQ, so use that rather than allocating one.
1039                 resources.reserve_irq(gsi);
1040                 Some((pin, gsi))
1041             }
1042             PreferredIrq::Any => {
1043                 // The device did not provide a preferred IRQ but requested one, so allocate one.
1044 
1045                 // Choose a pin based on the slot's function number. Function 0 must always use
1046                 // INTA# for single-function devices per the PCI spec, and we choose to use INTA#
1047                 // for function 0 on multifunction devices and distribute the remaining functions
1048                 // evenly across the other pins.
1049                 let pin = match pci_address.func % 4 {
1050                     0 => PciInterruptPin::IntA,
1051                     1 => PciInterruptPin::IntB,
1052                     2 => PciInterruptPin::IntC,
1053                     _ => PciInterruptPin::IntD,
1054                 };
1055 
1056                 // If an IRQ number has already been assigned for a different function with this
1057                 // (bus, device, pin) combination, use it. Otherwise allocate a new one and insert
1058                 // it into the map.
1059                 let pin_key = (pci_address.bus, pci_address.dev, pin);
1060                 let irq_num = if let Some(irq_num) = dev_pin_irq.get(&pin_key) {
1061                     *irq_num
1062                 } else {
1063                     // If we have allocated fewer than `max_irqs` total, add a new irq to the `irqs`
1064                     // pool. Otherwise, share one of the existing `irqs`.
1065                     let irq_num = if irqs.len() < max_irqs {
1066                         let irq_num = resources
1067                             .allocate_irq()
1068                             .ok_or(DeviceRegistrationError::AllocateIrq)?;
1069                         irqs.push(irq_num);
1070                         irq_num
1071                     } else {
1072                         // Pick one of the existing IRQs to share, using `dev_idx` to distribute IRQ
1073                         // sharing evenly across devices.
1074                         irqs[dev_idx % max_irqs]
1075                     };
1076 
1077                     dev_pin_irq.insert(pin_key, irq_num);
1078                     irq_num
1079                 };
1080                 Some((pin, irq_num))
1081             }
1082             PreferredIrq::None => {
1083                 // The device does not want an INTx# IRQ.
1084                 None
1085             }
1086         };
1087 
1088         if let Some((pin, gsi)) = irq {
1089             let intx_event =
1090                 devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
1091 
1092             device.assign_irq(
1093                 intx_event
1094                     .try_clone()
1095                     .map_err(DeviceRegistrationError::EventClone)?,
1096                 pin,
1097                 gsi,
1098             );
1099 
1100             irq_chip
1101                 .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(device))
1102                 .map_err(DeviceRegistrationError::RegisterIrqfd)?;
1103 
1104             pci_irqs.push((pci_address, gsi, pin));
1105         }
1106     }
1107 
1108     // To prevent issues where device's on_sandbox may spawn thread before all
1109     // sandboxed devices are sandboxed we partition iterator to go over sandboxed
1110     // first. This is needed on linux platforms. On windows, this is a no-op since
1111     // jails are always None, even for sandboxed devices.
1112     let devices = {
1113         let (sandboxed, non_sandboxed): (Vec<_>, Vec<_>) = devices
1114             .into_iter()
1115             .enumerate()
1116             .partition(|(_, (_, jail))| jail.is_some());
1117         sandboxed.into_iter().chain(non_sandboxed)
1118     };
1119 
1120     let mut amls = BTreeMap::new();
1121     let mut gpe_scope_amls = BTreeMap::new();
1122     for (dev_idx, dev_value) in devices {
1123         #[cfg(any(target_os = "android", target_os = "linux"))]
1124         let (mut device, jail) = dev_value;
1125         #[cfg(windows)]
1126         let (mut device, _) = dev_value;
1127         let address = device_addrs[dev_idx];
1128 
1129         let mut keep_rds = device.keep_rds();
1130         syslog::push_descriptors(&mut keep_rds);
1131         cros_tracing::push_descriptors!(&mut keep_rds);
1132         metrics::push_descriptors(&mut keep_rds);
1133         keep_rds.append(&mut vm.get_memory().as_raw_descriptors());
1134 
1135         let ranges = io_ranges.remove(&dev_idx).unwrap_or_default();
1136         let device_ranges = device_ranges.remove(&dev_idx).unwrap_or_default();
1137         device
1138             .register_device_capabilities()
1139             .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
1140 
1141         if let Some(vcfg_base) = vcfg_base {
1142             let (methods, shm) = device.generate_acpi_methods();
1143             if !methods.is_empty() {
1144                 amls.insert(address, methods);
1145             }
1146             if let Some((offset, mmap)) = shm {
1147                 let _ = vm.add_memory_region(
1148                     GuestAddress(vcfg_base + offset as u64),
1149                     Box::new(mmap),
1150                     false,
1151                     false,
1152                     MemCacheType::CacheCoherent,
1153                 );
1154             }
1155         }
1156         let gpe_nr = device.set_gpe(resources);
1157 
1158         #[cfg(any(target_os = "android", target_os = "linux"))]
1159         let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
1160             let proxy = ProxyDevice::new(
1161                 device,
1162                 jail,
1163                 keep_rds,
1164                 #[cfg(feature = "swap")]
1165                 swap_controller,
1166             )
1167             .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
1168             pid_labels.insert(proxy.pid() as u32, proxy.debug_label());
1169             Arc::new(Mutex::new(proxy))
1170         } else {
1171             device.on_sandboxed();
1172             Arc::new(Mutex::new(device))
1173         };
1174         #[cfg(windows)]
1175         let arced_dev = {
1176             device.on_sandboxed();
1177             Arc::new(Mutex::new(device))
1178         };
1179         root.add_device(address, arced_dev.clone(), vm)
1180             .map_err(DeviceRegistrationError::PciRootAddDevice)?;
1181         for range in &ranges {
1182             mmio_bus
1183                 .insert(arced_dev.clone(), range.addr, range.size)
1184                 .map_err(DeviceRegistrationError::MmioInsert)?;
1185         }
1186 
1187         for range in &device_ranges {
1188             mmio_bus
1189                 .insert(arced_dev.clone(), range.addr, range.size)
1190                 .map_err(DeviceRegistrationError::MmioInsert)?;
1191         }
1192 
1193         if let Some(gpe_nr) = gpe_nr {
1194             if let Some(acpi_path) = root.acpi_path(&address) {
1195                 let mut gpe_aml = Vec::new();
1196 
1197                 GpeScope {}.cast_to_aml_bytes(
1198                     &mut gpe_aml,
1199                     gpe_nr,
1200                     format!("\\{}", acpi_path).as_str(),
1201                 );
1202                 if !gpe_aml.is_empty() {
1203                     gpe_scope_amls.insert(address, gpe_aml);
1204                 }
1205             }
1206         }
1207     }
1208 
1209     Ok((root, pci_irqs, pid_labels, amls, gpe_scope_amls))
1210 }
1211 
1212 /// Errors for image loading.
1213 #[sorted]
1214 #[derive(Error, Debug)]
1215 pub enum LoadImageError {
1216     #[error("Alignment not a power of two: {0}")]
1217     BadAlignment(u64),
1218     #[error("Getting image size failed: {0}")]
1219     GetLen(io::Error),
1220     #[error("GuestMemory get slice failed: {0}")]
1221     GuestMemorySlice(GuestMemoryError),
1222     #[error("Image size too large: {0}")]
1223     ImageSizeTooLarge(u64),
1224     #[error("No suitable memory region found")]
1225     NoSuitableMemoryRegion,
1226     #[error("Reading image into memory failed: {0}")]
1227     ReadToMemory(io::Error),
1228     #[error("Cannot load zero-sized image")]
1229     ZeroSizedImage,
1230 }
1231 
1232 /// Load an image from a file into guest memory.
1233 ///
1234 /// # Arguments
1235 ///
1236 /// * `guest_mem` - The memory to be used by the guest.
1237 /// * `guest_addr` - The starting address to load the image in the guest memory.
1238 /// * `max_size` - The amount of space in bytes available in the guest memory for the image.
1239 /// * `image` - The file containing the image to be loaded.
1240 ///
1241 /// The size in bytes of the loaded image is returned.
load_image<F>( guest_mem: &GuestMemory, image: &mut F, guest_addr: GuestAddress, max_size: u64, ) -> Result<usize, LoadImageError> where F: FileReadWriteAtVolatile + FileGetLen,1242 pub fn load_image<F>(
1243     guest_mem: &GuestMemory,
1244     image: &mut F,
1245     guest_addr: GuestAddress,
1246     max_size: u64,
1247 ) -> Result<usize, LoadImageError>
1248 where
1249     F: FileReadWriteAtVolatile + FileGetLen,
1250 {
1251     let size = image.get_len().map_err(LoadImageError::GetLen)?;
1252 
1253     if size > usize::MAX as u64 || size > max_size {
1254         return Err(LoadImageError::ImageSizeTooLarge(size));
1255     }
1256 
1257     // This is safe due to the bounds check above.
1258     let size = size as usize;
1259 
1260     let guest_slice = guest_mem
1261         .get_slice_at_addr(guest_addr, size)
1262         .map_err(LoadImageError::GuestMemorySlice)?;
1263     image
1264         .read_exact_at_volatile(guest_slice, 0)
1265         .map_err(LoadImageError::ReadToMemory)?;
1266 
1267     Ok(size)
1268 }
1269 
1270 /// Load an image from a file into guest memory at the highest possible address.
1271 ///
1272 /// # Arguments
1273 ///
1274 /// * `guest_mem` - The memory to be used by the guest.
1275 /// * `image` - The file containing the image to be loaded.
1276 /// * `min_guest_addr` - The minimum address of the start of the image.
1277 /// * `max_guest_addr` - The address to load the last byte of the image.
1278 /// * `region_filter` - The optional filter function for determining if the given guest memory
1279 ///   region is suitable for loading the image into it.
1280 /// * `align` - The minimum alignment of the start address of the image in bytes (must be a power of
1281 ///   two).
1282 ///
1283 /// The guest address and size in bytes of the loaded image are returned.
load_image_high<F>( guest_mem: &GuestMemory, image: &mut F, min_guest_addr: GuestAddress, max_guest_addr: GuestAddress, region_filter: Option<fn(&MemoryRegionInformation) -> bool>, align: u64, ) -> Result<(GuestAddress, usize), LoadImageError> where F: FileReadWriteAtVolatile + FileGetLen,1284 pub fn load_image_high<F>(
1285     guest_mem: &GuestMemory,
1286     image: &mut F,
1287     min_guest_addr: GuestAddress,
1288     max_guest_addr: GuestAddress,
1289     region_filter: Option<fn(&MemoryRegionInformation) -> bool>,
1290     align: u64,
1291 ) -> Result<(GuestAddress, usize), LoadImageError>
1292 where
1293     F: FileReadWriteAtVolatile + FileGetLen,
1294 {
1295     if !align.is_power_of_two() {
1296         return Err(LoadImageError::BadAlignment(align));
1297     }
1298 
1299     let max_size = max_guest_addr.offset_from(min_guest_addr) & !(align - 1);
1300     let size = image.get_len().map_err(LoadImageError::GetLen)?;
1301 
1302     if size == 0 {
1303         return Err(LoadImageError::ZeroSizedImage);
1304     }
1305 
1306     if size > usize::MAX as u64 || size > max_size {
1307         return Err(LoadImageError::ImageSizeTooLarge(size));
1308     }
1309 
1310     // Sort the list of guest memory regions by address so we can iterate over them in reverse order
1311     // (high to low).
1312     let mut regions: Vec<_> = guest_mem
1313         .regions()
1314         .filter(region_filter.unwrap_or(|_| true))
1315         .collect();
1316     regions.sort_unstable_by(|a, b| a.guest_addr.cmp(&b.guest_addr));
1317 
1318     // Find the highest valid address inside a guest memory region that satisfies the requested
1319     // alignment and min/max address requirements while having enough space for the image.
1320     let guest_addr = regions
1321         .into_iter()
1322         .rev()
1323         .filter_map(|r| {
1324             // Highest address within this region.
1325             let rgn_max_addr = r
1326                 .guest_addr
1327                 .checked_add((r.size as u64).checked_sub(1)?)?
1328                 .min(max_guest_addr);
1329             // Lowest aligned address within this region.
1330             let rgn_start_aligned = r.guest_addr.align(align)?;
1331             // Hypothetical address of the image if loaded at the end of the region.
1332             let image_addr = rgn_max_addr.checked_sub(size - 1)? & !(align - 1);
1333 
1334             // Would the image fit within the region?
1335             if image_addr >= rgn_start_aligned {
1336                 Some(image_addr)
1337             } else {
1338                 None
1339             }
1340         })
1341         .find(|&addr| addr >= min_guest_addr)
1342         .ok_or(LoadImageError::NoSuitableMemoryRegion)?;
1343 
1344     // This is safe due to the bounds check above.
1345     let size = size as usize;
1346 
1347     let guest_slice = guest_mem
1348         .get_slice_at_addr(guest_addr, size)
1349         .map_err(LoadImageError::GuestMemorySlice)?;
1350     image
1351         .read_exact_at_volatile(guest_slice, 0)
1352         .map_err(LoadImageError::ReadToMemory)?;
1353 
1354     Ok((guest_addr, size))
1355 }
1356 
1357 /// SMBIOS table configuration
1358 #[derive(Clone, Debug, Default, Serialize, Deserialize, FromKeyValues, PartialEq, Eq)]
1359 #[serde(deny_unknown_fields, rename_all = "kebab-case")]
1360 pub struct SmbiosOptions {
1361     /// BIOS vendor name.
1362     pub bios_vendor: Option<String>,
1363 
1364     /// BIOS version number (free-form string).
1365     pub bios_version: Option<String>,
1366 
1367     /// System manufacturer name.
1368     pub manufacturer: Option<String>,
1369 
1370     /// System product name.
1371     pub product_name: Option<String>,
1372 
1373     /// System serial number (free-form string).
1374     pub serial_number: Option<String>,
1375 
1376     /// System UUID.
1377     pub uuid: Option<Uuid>,
1378 
1379     /// Additional OEM strings to add to SMBIOS table.
1380     #[serde(default)]
1381     pub oem_strings: Vec<String>,
1382 }
1383 
#[cfg(test)]
mod tests {
    use serde_keyvalue::from_key_values;
    use tempfile::tempfile;

    use super::*;

    /// `Pstore` parses from key-value form and requires both `path` and `size`.
    #[test]
    fn parse_pstore() {
        let expected = Pstore {
            path: "/some/path".into(),
            size: 16384,
        };
        let parsed: Pstore = from_key_values("path=/some/path,size=16384").unwrap();
        assert_eq!(parsed, expected);

        // Leaving out either key, or both, is an error.
        assert!(from_key_values::<Pstore>("path=/some/path").is_err());
        assert!(from_key_values::<Pstore>("size=16384").is_err());
        assert!(from_key_values::<Pstore>("").is_err());
    }

    /// `CpuSet` accepts single CPUs, ranges, and a mix of both in key-value form.
    #[test]
    fn deserialize_cpuset_serde_kv() {
        let cases = [
            ("[0,4,7]", vec![0, 4, 7]),
            ("[9-12]", vec![9, 10, 11, 12]),
            ("[0,4,7,9-12,15]", vec![0, 4, 7, 9, 10, 11, 12, 15]),
        ];

        for (input, cpus) in cases {
            let parsed: CpuSet = from_key_values(input).unwrap();
            assert_eq!(parsed, CpuSet::new(cpus));
        }
    }

    /// `CpuSet` round-trips through JSON, with ranges written as strings.
    #[test]
    fn deserialize_serialize_cpuset_json() {
        let cases = [
            ("[0,4,7]", vec![0, 4, 7]),
            (r#"["9-12"]"#, vec![9, 10, 11, 12]),
            (r#"[0,4,7,"9-12",15]"#, vec![0, 4, 7, 9, 10, 11, 12, 15]),
        ];

        for (json, cpus) in cases {
            let expected = CpuSet::new(cpus);
            let parsed: CpuSet = serde_json::from_str(json).unwrap();
            assert_eq!(parsed, expected);
            assert_eq!(serde_json::to_string(&expected).unwrap(), json);
        }
    }

    /// With `max_guest_addr` above all guest memory, the image lands at the top of the highest
    /// region, aligned downwards.
    #[test]
    fn load_image_high_max_4g() {
        const TEST_IMAGE_SIZE: u64 = 1234;
        const TEST_ALIGN: u64 = 0x8000;

        let guest_mem = GuestMemory::new(&[
            (GuestAddress(0x0000_0000), 0x4000_0000), // 0x00000000..0x40000000
            (GuestAddress(0x8000_0000), 0x4000_0000), // 0x80000000..0xC0000000
        ])
        .unwrap();

        let mut image_file = tempfile().unwrap();
        image_file.set_len(TEST_IMAGE_SIZE).unwrap();

        let (loaded_addr, loaded_size) = load_image_high(
            &guest_mem,
            &mut image_file,
            GuestAddress(0x8000),
            GuestAddress(0xFFFF_FFFF), // max_guest_addr beyond highest guest memory region
            None,
            TEST_ALIGN,
        )
        .unwrap();

        assert_eq!(loaded_addr, GuestAddress(0xBFFF_8000));
        assert_eq!(loaded_addr.offset() % TEST_ALIGN, 0);
        assert_eq!(loaded_size, TEST_IMAGE_SIZE as usize);
    }
}
1473