• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #[cfg(target_os = "android")]
6 mod android;
7 pub mod cmdline;
8 pub mod config;
9 mod device_helpers;
10 pub(crate) mod ext2;
11 #[cfg(feature = "gpu")]
12 pub(crate) mod gpu;
13 #[cfg(feature = "pci-hotplug")]
14 pub(crate) mod jail_warden;
15 #[cfg(feature = "pci-hotplug")]
16 pub(crate) mod pci_hotplug_helpers;
17 #[cfg(feature = "pci-hotplug")]
18 pub(crate) mod pci_hotplug_manager;
19 mod vcpu;
20 
21 #[cfg(all(feature = "pvclock", target_arch = "aarch64"))]
22 use std::arch::asm;
23 use std::cmp::max;
24 use std::collections::BTreeMap;
25 use std::collections::BTreeSet;
26 #[cfg(feature = "registered_events")]
27 use std::collections::HashMap;
28 #[cfg(feature = "registered_events")]
29 use std::collections::HashSet;
30 use std::convert::TryInto;
31 use std::ffi::CString;
32 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
33 use std::fs::create_dir_all;
34 use std::fs::File;
35 use std::fs::OpenOptions;
36 #[cfg(feature = "registered_events")]
37 use std::hash::Hash;
38 use std::io::stdin;
39 use std::iter;
40 use std::mem;
41 #[cfg(target_arch = "x86_64")]
42 use std::ops::RangeInclusive;
43 use std::os::unix::process::ExitStatusExt;
44 use std::path::Path;
45 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
46 use std::path::PathBuf;
47 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
48 use std::process;
49 #[cfg(feature = "registered_events")]
50 use std::rc::Rc;
51 use std::sync::mpsc;
52 use std::sync::Arc;
53 use std::sync::Barrier;
54 use std::thread::JoinHandle;
55 
56 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
57 use aarch64::AArch64 as Arch;
58 use acpi_tables::sdt::SDT;
59 use anyhow::anyhow;
60 use anyhow::bail;
61 use anyhow::Context;
62 use anyhow::Result;
63 use arch::DtbOverlay;
64 use arch::IrqChipArch;
65 use arch::LinuxArch;
66 use arch::RunnableLinuxVm;
67 use arch::VcpuAffinity;
68 use arch::VcpuArch;
69 use arch::VirtioDeviceStub;
70 use arch::VmArch;
71 use arch::VmComponents;
72 use arch::VmImage;
73 use argh::FromArgs;
74 use base::ReadNotifier;
75 #[cfg(feature = "balloon")]
76 use base::UnixSeqpacket;
77 use base::UnixSeqpacketListener;
78 use base::UnlinkUnixSeqpacketListener;
79 use base::*;
80 use cros_async::Executor;
81 use device_helpers::*;
82 use devices::create_devices_worker_thread;
83 use devices::serial_device::SerialHardware;
84 #[cfg(all(feature = "pvclock", target_arch = "x86_64"))]
85 use devices::tsc::get_tsc_sync_mitigations;
86 use devices::vfio::VfioContainerManager;
87 #[cfg(feature = "gpu")]
88 use devices::virtio;
89 #[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
90 use devices::virtio::device_constants::video::VideoDeviceType;
91 #[cfg(feature = "gpu")]
92 use devices::virtio::gpu::EventDevice;
93 #[cfg(target_arch = "x86_64")]
94 use devices::virtio::memory_mapper::MemoryMapper;
95 use devices::virtio::memory_mapper::MemoryMapperTrait;
96 use devices::virtio::vhost::user::VhostUserConnectionTrait;
97 use devices::virtio::vhost::user::VhostUserListener;
98 #[cfg(feature = "balloon")]
99 use devices::virtio::BalloonFeatures;
100 #[cfg(feature = "pci-hotplug")]
101 use devices::virtio::NetParameters;
102 #[cfg(feature = "pci-hotplug")]
103 use devices::virtio::NetParametersMode;
104 use devices::virtio::VirtioDevice;
105 use devices::virtio::VirtioDeviceType;
106 use devices::Bus;
107 use devices::BusDeviceObj;
108 use devices::BusType;
109 use devices::CoIommuDev;
110 #[cfg(feature = "usb")]
111 use devices::DeviceProvider;
112 #[cfg(target_arch = "x86_64")]
113 use devices::HotPlugBus;
114 #[cfg(target_arch = "x86_64")]
115 use devices::HotPlugKey;
116 use devices::IommuDevType;
117 use devices::IrqEventIndex;
118 use devices::IrqEventSource;
119 #[cfg(feature = "pci-hotplug")]
120 use devices::NetResourceCarrier;
121 #[cfg(target_arch = "x86_64")]
122 use devices::PciAddress;
123 #[cfg(target_arch = "x86_64")]
124 use devices::PciBridge;
125 use devices::PciDevice;
126 #[cfg(target_arch = "x86_64")]
127 use devices::PciMmioMapper;
128 #[cfg(target_arch = "x86_64")]
129 use devices::PciRoot;
130 #[cfg(target_arch = "x86_64")]
131 use devices::PciRootCommand;
132 #[cfg(target_arch = "x86_64")]
133 use devices::PcieDownstreamPort;
134 #[cfg(target_arch = "x86_64")]
135 use devices::PcieHostPort;
136 #[cfg(target_arch = "x86_64")]
137 use devices::PcieRootPort;
138 #[cfg(target_arch = "x86_64")]
139 use devices::PcieUpstreamPort;
140 use devices::PvPanicCode;
141 use devices::PvPanicPciDevice;
142 #[cfg(feature = "pci-hotplug")]
143 use devices::ResourceCarrier;
144 use devices::StubPciDevice;
145 use devices::VirtioPciDevice;
146 #[cfg(feature = "usb")]
147 use devices::XhciController;
148 #[cfg(feature = "gpu")]
149 use gpu::*;
150 #[cfg(target_arch = "riscv64")]
151 use hypervisor::CpuConfigRiscv64;
152 #[cfg(target_arch = "x86_64")]
153 use hypervisor::CpuConfigX86_64;
154 use hypervisor::Hypervisor;
155 use hypervisor::HypervisorCap;
156 use hypervisor::MemCacheType;
157 use hypervisor::ProtectionType;
158 use hypervisor::Vm;
159 use hypervisor::VmCap;
160 use jail::*;
161 #[cfg(feature = "pci-hotplug")]
162 use jail_warden::JailWarden;
163 #[cfg(feature = "pci-hotplug")]
164 use jail_warden::JailWardenImpl;
165 #[cfg(feature = "pci-hotplug")]
166 use jail_warden::PermissiveJailWarden;
167 use libc;
168 use metrics::MetricsController;
169 use minijail::Minijail;
170 #[cfg(feature = "pci-hotplug")]
171 use pci_hotplug_manager::PciHotPlugManager;
172 use resources::AddressRange;
173 use resources::Alloc;
174 use resources::SystemAllocator;
175 #[cfg(target_arch = "riscv64")]
176 use riscv64::Riscv64 as Arch;
177 use rutabaga_gfx::RutabagaGralloc;
178 use rutabaga_gfx::RutabagaGrallocBackendFlags;
179 use smallvec::SmallVec;
180 #[cfg(feature = "swap")]
181 use swap::SwapController;
182 use sync::Condvar;
183 use sync::Mutex;
184 use vm_control::api::VmMemoryClient;
185 use vm_control::*;
186 use vm_memory::FileBackedMappingParameters;
187 use vm_memory::GuestAddress;
188 use vm_memory::GuestMemory;
189 use vm_memory::MemoryPolicy;
190 use vm_memory::MemoryRegionOptions;
191 #[cfg(target_arch = "x86_64")]
192 use x86_64::X8664arch as Arch;
193 
194 use crate::crosvm::config::Config;
195 use crate::crosvm::config::Executable;
196 use crate::crosvm::config::HypervisorKind;
197 use crate::crosvm::config::InputDeviceOption;
198 use crate::crosvm::config::IrqChipKind;
199 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
200 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
201 #[cfg(feature = "gdb")]
202 use crate::crosvm::gdb::gdb_thread;
203 #[cfg(feature = "gdb")]
204 use crate::crosvm::gdb::GdbStub;
205 #[cfg(target_arch = "x86_64")]
206 use crate::crosvm::ratelimit::Ratelimit;
207 use crate::crosvm::sys::cmdline::DevicesCommand;
208 use crate::crosvm::sys::config::SharedDir;
209 use crate::crosvm::sys::config::SharedDirKind;
210 use crate::crosvm::sys::platform::vcpu::VcpuPidTid;
211 
/// Path of the KVM hypervisor device node.
const KVM_PATH: &str = "/dev/kvm";
/// Path of the GenieZone hypervisor device node (Arm-only, feature-gated).
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
const GENIEZONE_PATH: &str = "/dev/gzvm";
/// Path of the Gunyah hypervisor device node (Arm-only, feature-gated).
// Declared `const` for consistency with the sibling path constants above; a
// `static` promotion is unnecessary for a plain string literal.
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
const GUNYAH_PATH: &str = "/dev/gunyah";
218 
219 fn create_virtio_devices(
220     cfg: &Config,
221     vm: &mut impl VmArch,
222     resources: &mut SystemAllocator,
223     add_control_tube: &mut impl FnMut(AnyControlTube),
224     #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
225     #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
226     worker_process_pids: &mut BTreeSet<Pid>,
227     #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
228     #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
229     #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
230 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
231     let mut devs = Vec::new();
232 
233     #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
234     let mut resource_bridges = Vec::<Tube>::new();
235 
236     if !cfg.wayland_socket_paths.is_empty() {
237         #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
238         let mut wl_resource_bridge = None::<Tube>;
239 
240         #[cfg(feature = "gpu")]
241         {
242             if cfg.gpu_parameters.is_some() {
243                 let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
244                 resource_bridges.push(gpu_socket);
245                 wl_resource_bridge = Some(wl_socket);
246             }
247         }
248 
249         devs.push(create_wayland_device(
250             cfg.protection_type,
251             cfg.jail_config.as_ref(),
252             &cfg.wayland_socket_paths,
253             wl_resource_bridge,
254         )?);
255     }
256 
257     #[cfg(all(feature = "media", feature = "video-decoder"))]
258     let media_adapter_cfg = cfg
259         .media_decoder
260         .iter()
261         .map(|config| {
262             let (video_tube, gpu_tube) =
263                 Tube::pair().expect("failed to create tube for media adapter");
264             resource_bridges.push(gpu_tube);
265             (video_tube, config.backend)
266         })
267         .collect::<Vec<_>>();
268 
269     #[cfg(feature = "video-decoder")]
270     let video_dec_cfg = cfg
271         .video_dec
272         .iter()
273         .map(|config| {
274             let (video_tube, gpu_tube) =
275                 Tube::pair().expect("failed to create tube for video decoder");
276             resource_bridges.push(gpu_tube);
277             (video_tube, config.backend)
278         })
279         .collect::<Vec<_>>();
280 
281     #[cfg(feature = "video-encoder")]
282     let video_enc_cfg = cfg
283         .video_enc
284         .iter()
285         .map(|config| {
286             let (video_tube, gpu_tube) =
287                 Tube::pair().expect("failed to create tube for video encoder");
288             resource_bridges.push(gpu_tube);
289             (video_tube, config.backend)
290         })
291         .collect::<Vec<_>>();
292 
293     #[cfg(feature = "gpu")]
294     {
295         if let Some(gpu_parameters) = &cfg.gpu_parameters {
296             let mut event_devices = Vec::new();
297             if cfg.display_window_mouse {
298                 let display_param = if gpu_parameters.display_params.is_empty() {
299                     Default::default()
300                 } else {
301                     gpu_parameters.display_params[0].clone()
302                 };
303                 let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();
304 
305                 let (event_device_socket, virtio_dev_socket) =
306                     StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
307                         .context("failed to create socket")?;
308                 let mut multi_touch_width = gpu_display_w;
309                 let mut multi_touch_height = gpu_display_h;
310                 let mut multi_touch_name = None;
311                 for input in &cfg.virtio_input {
312                     if let InputDeviceOption::MultiTouch {
313                         width,
314                         height,
315                         name,
316                         ..
317                     } = input
318                     {
319                         if let Some(width) = width {
320                             multi_touch_width = *width;
321                         }
322                         if let Some(height) = height {
323                             multi_touch_height = *height;
324                         }
325                         if let Some(name) = name {
326                             multi_touch_name = Some(name.as_str());
327                         }
328                         break;
329                     }
330                 }
331                 let dev = virtio::input::new_multi_touch(
332                     // u32::MAX is the least likely to collide with the indices generated above for
333                     // the multi_touch options, which begin at 0.
334                     u32::MAX,
335                     virtio_dev_socket,
336                     multi_touch_width,
337                     multi_touch_height,
338                     multi_touch_name,
339                     virtio::base_features(cfg.protection_type),
340                 )
341                 .context("failed to set up mouse device")?;
342                 devs.push(VirtioDeviceStub {
343                     dev: Box::new(dev),
344                     jail: simple_jail(cfg.jail_config.as_ref(), "input_device")?,
345                 });
346                 event_devices.push(EventDevice::touchscreen(event_device_socket));
347             }
348             if cfg.display_window_keyboard {
349                 let (event_device_socket, virtio_dev_socket) =
350                     StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
351                         .context("failed to create socket")?;
352                 let dev = virtio::input::new_keyboard(
353                     // u32::MAX is the least likely to collide with the indices generated above for
354                     // the multi_touch options, which begin at 0.
355                     u32::MAX,
356                     virtio_dev_socket,
357                     virtio::base_features(cfg.protection_type),
358                 )
359                 .context("failed to set up keyboard device")?;
360                 devs.push(VirtioDeviceStub {
361                     dev: Box::new(dev),
362                     jail: simple_jail(cfg.jail_config.as_ref(), "input_device")?,
363                 });
364                 event_devices.push(EventDevice::keyboard(event_device_socket));
365             }
366 
367             let (gpu_control_host_tube, gpu_control_device_tube) =
368                 Tube::pair().context("failed to create gpu tube")?;
369             add_control_tube(DeviceControlTube::Gpu(gpu_control_host_tube).into());
370             devs.push(create_gpu_device(
371                 cfg,
372                 vm_evt_wrtube,
373                 gpu_control_device_tube,
374                 resource_bridges,
375                 render_server_fd,
376                 has_vfio_gfx_device,
377                 event_devices,
378             )?);
379         }
380     }
381 
382     for (_, param) in cfg
383         .serial_parameters
384         .iter()
385         .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
386     {
387         let dev =
388             param.create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?;
389         devs.push(dev);
390     }
391 
392     for disk in &cfg.disks {
393         let (disk_host_tube, disk_device_tube) = Tube::pair().context("failed to create tube")?;
394         add_control_tube(DeviceControlTube::Disk(disk_host_tube).into());
395         let disk_config = DiskConfig::new(disk, Some(disk_device_tube));
396         devs.push(
397             disk_config
398                 .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
399         );
400     }
401 
402     if !cfg.scsis.is_empty() {
403         let scsi_config = ScsiConfig(&cfg.scsis);
404         devs.push(
405             scsi_config
406                 .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
407         );
408     }
409 
410     for (index, pmem_disk) in cfg.pmems.iter().enumerate() {
411         let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
412         add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
413         devs.push(create_pmem_device(
414             cfg.protection_type,
415             cfg.jail_config.as_ref(),
416             vm,
417             resources,
418             pmem_disk,
419             index,
420             pmem_device_tube,
421         )?);
422     }
423 
424     for (index, pmem_ext2) in cfg.pmem_ext2.iter().enumerate() {
425         // Prepare a `VmMemoryClient` for pmem-ext2 device to send a request for mmap() and memory
426     // registration.
427         let (pmem_ext2_host_tube, pmem_ext2_device_tube) =
428             Tube::pair().context("failed to create tube")?;
429         let vm_memory_client = VmMemoryClient::new(pmem_ext2_device_tube);
430         add_control_tube(
431             VmMemoryTube {
432                 tube: pmem_ext2_host_tube,
433                 expose_with_viommu: false,
434             }
435             .into(),
436         );
437         let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
438         add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
439         devs.push(create_pmem_ext2_device(
440             cfg.protection_type,
441             cfg.jail_config.as_ref(),
442             resources,
443             pmem_ext2,
444             index,
445             vm_memory_client,
446             pmem_device_tube,
447             worker_process_pids,
448         )?);
449     }
450 
451     if cfg.rng {
452         devs.push(create_rng_device(
453             cfg.protection_type,
454             cfg.jail_config.as_ref(),
455         )?);
456     }
457 
458     #[cfg(feature = "pvclock")]
459     if cfg.pvclock {
460         // pvclock gets a tube for handling suspend/resume requests from the main thread.
461         let (host_suspend_tube, suspend_tube) = Tube::pair().context("failed to create tube")?;
462         add_control_tube(DeviceControlTube::PvClock(host_suspend_tube).into());
463 
464         let frequency: u64;
465         #[cfg(target_arch = "x86_64")]
466         {
467             let tsc_state = devices::tsc::tsc_state()?;
468             let tsc_sync_mitigations =
469                 get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
470             if tsc_state.core_grouping.size() > 1 {
471                 // Host TSCs are not in sync. Log what mitigations are applied.
472                 warn!(
473                     "Host TSCs are not in sync, applying the following mitigations: {:?}",
474                     tsc_sync_mitigations
475                 );
476             }
477             frequency = tsc_state.frequency;
478         }
479         #[cfg(target_arch = "aarch64")]
480         {
481             let mut x: u64;
482         // SAFETY: This instruction has no side effects apart from storing the current timestamp
483             //         frequency into the specified register.
484             unsafe {
485                 asm!("mrs {x}, cntfrq_el0",
486                     x = out(reg) x,
487                 );
488             }
489             frequency = x;
490 
491             // If unset, KVM defaults to an offset that is calculated from VM boot time. Explicitly
492             // set it to zero on boot. When updating the offset, we always set it to the total
493             // amount of time the VM has been suspended.
494             vm.set_counter_offset(0)
495                 .context("failed to set up pvclock")?;
496         }
497         let dev = create_pvclock_device(
498             cfg.protection_type,
499             cfg.jail_config.as_ref(),
500             frequency,
501             suspend_tube,
502         )?;
503         devs.push(dev);
504         info!("virtio-pvclock is enabled for this vm");
505     }
506 
507     #[cfg(feature = "vtpm")]
508     {
509         if cfg.vtpm_proxy {
510             devs.push(create_vtpm_proxy_device(
511                 cfg.protection_type,
512                 cfg.jail_config.as_ref(),
513             )?);
514         }
515     }
516 
517     let mut keyboard_idx = 0;
518     let mut mouse_idx = 0;
519     let mut rotary_idx = 0;
520     let mut switches_idx = 0;
521     let mut multi_touch_idx = 0;
522     let mut single_touch_idx = 0;
523     let mut trackpad_idx = 0;
524     let mut multi_touch_trackpad_idx = 0;
525     let mut custom_idx = 0;
526     for input in &cfg.virtio_input {
527         let input_dev = match input {
528             InputDeviceOption::Evdev { path } => create_vinput_device(
529                 cfg.protection_type,
530                 cfg.jail_config.as_ref(),
531                 path.as_path(),
532             )?,
533             InputDeviceOption::Keyboard { path } => {
534                 let dev = create_keyboard_device(
535                     cfg.protection_type,
536                     cfg.jail_config.as_ref(),
537                     path.as_path(),
538                     keyboard_idx,
539                 )?;
540                 keyboard_idx += 1;
541                 dev
542             }
543             InputDeviceOption::Mouse { path } => {
544                 let dev = create_mouse_device(
545                     cfg.protection_type,
546                     cfg.jail_config.as_ref(),
547                     path.as_path(),
548                     mouse_idx,
549                 )?;
550                 mouse_idx += 1;
551                 dev
552             }
553             InputDeviceOption::MultiTouch {
554                 path,
555                 width,
556                 height,
557                 name,
558             } => {
559                 let mut width = *width;
560                 let mut height = *height;
561                 if multi_touch_idx == 0 {
562                     if width.is_none() {
563                         width = cfg.display_input_width;
564                     }
565                     if height.is_none() {
566                         height = cfg.display_input_height;
567                     }
568                 }
569                 let dev = create_multi_touch_device(
570                     cfg.protection_type,
571                     cfg.jail_config.as_ref(),
572                     path.as_path(),
573                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
574                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
575                     name.as_deref(),
576                     multi_touch_idx,
577                 )?;
578                 multi_touch_idx += 1;
579                 dev
580             }
581             InputDeviceOption::Rotary { path } => {
582                 let dev = create_rotary_device(
583                     cfg.protection_type,
584                     cfg.jail_config.as_ref(),
585                     path.as_path(),
586                     rotary_idx,
587                 )?;
588                 rotary_idx += 1;
589                 dev
590             }
591             InputDeviceOption::SingleTouch {
592                 path,
593                 width,
594                 height,
595                 name,
596             } => {
597                 let mut width = *width;
598                 let mut height = *height;
599                 if single_touch_idx == 0 {
600                     if width.is_none() {
601                         width = cfg.display_input_width;
602                     }
603                     if height.is_none() {
604                         height = cfg.display_input_height;
605                     }
606                 }
607                 let dev = create_single_touch_device(
608                     cfg.protection_type,
609                     cfg.jail_config.as_ref(),
610                     path.as_path(),
611                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
612                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
613                     name.as_deref(),
614                     single_touch_idx,
615                 )?;
616                 single_touch_idx += 1;
617                 dev
618             }
619             InputDeviceOption::Switches { path } => {
620                 let dev = create_switches_device(
621                     cfg.protection_type,
622                     cfg.jail_config.as_ref(),
623                     path.as_path(),
624                     switches_idx,
625                 )?;
626                 switches_idx += 1;
627                 dev
628             }
629             InputDeviceOption::Trackpad {
630                 path,
631                 width,
632                 height,
633                 name,
634             } => {
635                 let dev = create_trackpad_device(
636                     cfg.protection_type,
637                     cfg.jail_config.as_ref(),
638                     path.as_path(),
639                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
640                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
641                     name.as_deref(),
642                     trackpad_idx,
643                 )?;
644                 trackpad_idx += 1;
645                 dev
646             }
647             InputDeviceOption::MultiTouchTrackpad {
648                 path,
649                 width,
650                 height,
651                 name,
652             } => {
653                 let dev = create_multitouch_trackpad_device(
654                     cfg.protection_type,
655                     cfg.jail_config.as_ref(),
656                     path.as_path(),
657                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
658                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
659                     name.as_deref(),
660                     multi_touch_trackpad_idx,
661                 )?;
662                 multi_touch_trackpad_idx += 1;
663                 dev
664             }
665             InputDeviceOption::Custom { path, config_path } => {
666                 let dev = create_custom_device(
667                     cfg.protection_type,
668                     cfg.jail_config.as_ref(),
669                     path.as_path(),
670                     custom_idx,
671                     config_path.clone(),
672                 )?;
673                 custom_idx += 1;
674                 dev
675             }
676         };
677         devs.push(input_dev);
678     }
679 
680     #[cfg(feature = "balloon")]
681     if cfg.balloon {
682         let balloon_device_tube = if let Some(ref path) = cfg.balloon_control {
683             Tube::try_from(UnixSeqpacket::connect(path).with_context(|| {
684                 format!(
685                     "failed to connect to balloon control socket {}",
686                     path.display(),
687                 )
688             })?)?
689         } else {
690             // Balloon gets a special socket so balloon requests can be forwarded
691             // from the main process.
692             let (host, device) = Tube::pair().context("failed to create tube")?;
693             add_control_tube(DeviceControlTube::Balloon(host).into());
694             device
695         };
696 
697         let balloon_features = (cfg.balloon_page_reporting as u64)
698             << BalloonFeatures::PageReporting as u64
699             | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;
700 
701         let init_balloon_size = if let Some(init_memory) = cfg.init_memory {
702             let init_memory_bytes = init_memory.saturating_mul(1024 * 1024);
703             let total_memory_bytes = vm.get_memory().memory_size();
704 
705             if init_memory_bytes > total_memory_bytes {
706                 bail!(
707                     "initial memory {} cannot be greater than total memory {}",
708                     init_memory,
709                     total_memory_bytes / (1024 * 1024),
710                 );
711             }
712 
713             // The initial balloon size is the total memory size minus the initial memory size.
714             total_memory_bytes - init_memory_bytes
715         } else {
716             // No --init-mem specified; start with balloon completely deflated.
717             0
718         };
719 
720         // The balloon device also needs a tube to communicate back to the main process to
721         // handle remapping memory dynamically.
722         let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
723             Tube::pair().context("failed to create tube")?;
724         add_control_tube(
725             VmMemoryTube {
726                 tube: dynamic_mapping_host_tube,
727                 expose_with_viommu: false,
728             }
729             .into(),
730         );
731 
732         devs.push(create_balloon_device(
733             cfg.protection_type,
734             cfg.jail_config.as_ref(),
735             balloon_device_tube,
736             balloon_inflate_tube,
737             init_balloon_size,
738             VmMemoryClient::new(dynamic_mapping_device_tube),
739             balloon_features,
740             #[cfg(feature = "registered_events")]
741             Some(
742                 registered_evt_q
743                     .try_clone()
744                     .context("failed to clone registered_evt_q tube")?,
745             ),
746             cfg.balloon_ws_num_bins,
747         )?);
748     }
749 
750     #[cfg(feature = "net")]
751     for opt in &cfg.net {
752         let dev =
753             opt.create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?;
754         devs.push(dev);
755     }
756 
757     #[cfg(feature = "audio")]
758     {
759         for (card_index, virtio_snd) in cfg.virtio_snds.iter().enumerate() {
760             let (snd_host_tube, snd_device_tube) =
761                 Tube::pair().context("failed to create tube for snd")?;
762             add_control_tube(DeviceControlTube::Snd(snd_host_tube).into());
763             let mut snd_params = virtio_snd.clone();
764             snd_params.card_index = card_index;
765             devs.push(create_virtio_snd_device(
766                 cfg.protection_type,
767                 cfg.jail_config.as_ref(),
768                 snd_params,
769                 snd_device_tube,
770             )?);
771         }
772     }
773 
774     #[cfg(any(target_os = "android", target_os = "linux"))]
775     #[cfg(feature = "media")]
776     {
777         for v4l2_device in &cfg.v4l2_proxy {
778             devs.push(create_v4l2_device(cfg.protection_type, v4l2_device)?);
779         }
780     }
781 
782     #[cfg(feature = "media")]
783     if cfg.simple_media_device {
784         devs.push(create_simple_media_device(cfg.protection_type)?);
785     }
786 
787     #[cfg(all(feature = "media", feature = "video-decoder"))]
788     {
789         for (tube, backend) in media_adapter_cfg {
790             devs.push(create_virtio_media_adapter(
791                 cfg.protection_type,
792                 cfg.jail_config.as_ref(),
793                 tube,
794                 backend,
795             )?);
796         }
797     }
798 
799     #[cfg(feature = "video-decoder")]
800     {
801         for (tube, backend) in video_dec_cfg {
802             register_video_device(
803                 backend,
804                 &mut devs,
805                 tube,
806                 cfg.protection_type,
807                 cfg.jail_config.as_ref(),
808                 VideoDeviceType::Decoder,
809             )?;
810         }
811     }
812 
813     #[cfg(feature = "video-encoder")]
814     {
815         for (tube, backend) in video_enc_cfg {
816             register_video_device(
817                 backend,
818                 &mut devs,
819                 tube,
820                 cfg.protection_type,
821                 cfg.jail_config.as_ref(),
822                 VideoDeviceType::Encoder,
823             )?;
824         }
825     }
826 
827     if let Some(vsock_config) = &cfg.vsock {
828         devs.push(
829             vsock_config
830                 .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
831         );
832     }
833 
834     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
835     {
836         if cfg.vhost_scmi {
837             devs.push(create_vhost_scmi_device(
838                 cfg.protection_type,
839                 cfg.jail_config.as_ref(),
840                 cfg.vhost_scmi_device.clone(),
841             )?);
842         }
843     }
844     for vhost_user_fs in &cfg.vhost_user_fs {
845         devs.push(create_vhost_user_fs_device(
846             cfg.protection_type,
847             vhost_user_fs,
848         )?);
849     }
850 
851     for shared_dir in &cfg.shared_dirs {
852         let SharedDir {
853             src,
854             tag,
855             kind,
856             ugid,
857             uid_map,
858             gid_map,
859             fs_cfg,
860             p9_cfg,
861         } = shared_dir;
862 
863         let dev = match kind {
864             SharedDirKind::FS => {
865                 let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
866                 add_control_tube(TaggedControlTube::Fs(host_tube).into());
867 
868                 create_fs_device(
869                     cfg.protection_type,
870                     cfg.jail_config.as_ref(),
871                     *ugid,
872                     uid_map,
873                     gid_map,
874                     src,
875                     tag,
876                     fs_cfg.clone(),
877                     device_tube,
878                 )?
879             }
880             SharedDirKind::P9 => create_9p_device(
881                 cfg.protection_type,
882                 cfg.jail_config.as_ref(),
883                 *ugid,
884                 uid_map,
885                 gid_map,
886                 src,
887                 tag,
888                 p9_cfg.clone(),
889             )?,
890         };
891         devs.push(dev);
892     }
893 
894     #[cfg(feature = "audio")]
895     if let Some(path) = &cfg.sound {
896         devs.push(create_sound_device(
897             path,
898             cfg.protection_type,
899             cfg.jail_config.as_ref(),
900         )?);
901     }
902 
903     for opt in &cfg.vhost_user {
904         devs.push(create_vhost_user_frontend(
905             cfg.protection_type,
906             opt,
907             cfg.vhost_user_connect_timeout_ms,
908         )?);
909     }
910 
911     Ok(devs)
912 }
913 
/// Builds the list of bus devices for the VM from `cfg`.
///
/// This covers VFIO passthrough devices (plus an optional coIOMMU device),
/// every virtio device wrapped as a virtio-PCI device, the xHCI USB
/// controller, stub PCI devices, and a pvpanic device. Each returned entry
/// pairs the device with the minijail it should run in (`None` = unjailed).
///
/// Host-side halves of the tubes created here are handed to
/// `add_control_tube`; `iova_max_addr` and `iommu_attached_endpoints` are
/// updated in place as VFIO PCI devices are created.
fn create_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    vm_evt_wrtube: &SendTube,
    iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    #[cfg(feature = "usb")] usb_provider: DeviceProvider,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    iova_max_addr: &mut Option<u64>,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
    vfio_container_manager: &mut VfioContainerManager,
    // Stores a set of PID of child processes that are suppose to exit cleanly.
    worker_process_pids: &mut BTreeSet<Pid>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
    let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
    #[cfg(feature = "balloon")]
    let mut balloon_inflate_tube: Option<Tube> = None;
    #[cfg(feature = "gpu")]
    let mut has_vfio_gfx_device = false;
    // VFIO passthrough devices (PCI and platform variants).
    if !cfg.vfio.is_empty() {
        let mut coiommu_attached_endpoints = Vec::new();

        for vfio_dev in &cfg.vfio {
            let (dev, jail, viommu_mapper) = create_vfio_device(
                cfg.jail_config.as_ref(),
                vm,
                resources,
                add_control_tube,
                &vfio_dev.path,
                false,
                None,
                vfio_dev.guest_address,
                Some(&mut coiommu_attached_endpoints),
                vfio_dev.iommu,
                vfio_dev.dt_symbol.clone(),
                vfio_container_manager,
            )?;
            match dev {
                VfioDeviceVariant::Pci(vfio_pci_device) => {
                    // Track the largest IOVA any VFIO PCI device can address.
                    *iova_max_addr = Some(max(
                        vfio_pci_device.get_max_iova(),
                        iova_max_addr.unwrap_or(0),
                    ));

                    #[cfg(feature = "gpu")]
                    if vfio_pci_device.is_gfx() {
                        has_vfio_gfx_device = true;
                    }

                    // Devices behind the virtio-iommu are keyed by their PCI
                    // address encoded as a u32.
                    if let Some(viommu_mapper) = viommu_mapper {
                        iommu_attached_endpoints.insert(
                            vfio_pci_device
                                .pci_address()
                                .context("not initialized")?
                                .to_u32(),
                            Arc::new(Mutex::new(Box::new(viommu_mapper))),
                        );
                    }

                    devices.push((Box::new(vfio_pci_device), jail));
                }
                VfioDeviceVariant::Platform(vfio_plat_dev) => {
                    devices.push((Box::new(vfio_plat_dev), jail));
                }
            }
        }

        // Raise the RLIMIT_MEMLOCK soft limit by the guest memory size when
        // any IOMMU endpoints exist — presumably so guest pages can be locked
        // for device DMA (NOTE(review): confirm against kernel VFIO docs).
        if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
            let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
            // SAFETY: trivially safe
            let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
            if res == 0 {
                // SAFETY: safe because getrlimit64 has returned success.
                let limit = unsafe { buf.assume_init() };
                let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
                let rlim_max = max(limit.rlim_max, rlim_new);
                if limit.rlim_cur < rlim_new {
                    let limit_arg = libc::rlimit64 {
                        rlim_cur: rlim_new,
                        rlim_max,
                    };
                    // SAFETY: trivially safe
                    let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
                    if res != 0 {
                        bail!("Set rlimit failed");
                    }
                }
            } else {
                bail!("Get rlimit failed");
            }
        }
        // With the balloon feature, `coiommu_tube` is only initialized inside
        // the `if` below (before `CoIommuDev::new` consumes it); without the
        // feature it is always `None`.
        #[cfg(feature = "balloon")]
        let coiommu_tube: Option<Tube>;
        #[cfg(not(feature = "balloon"))]
        let coiommu_tube: Option<Tube> = None;
        if !coiommu_attached_endpoints.is_empty() {
            let vfio_container = vfio_container_manager
                .get_container(IommuDevType::CoIommu, None as Option<&Path>)
                .context("failed to get vfio container")?;
            let (coiommu_host_tube, coiommu_device_tube) =
                Tube::pair().context("failed to create coiommu tube")?;
            add_control_tube(
                VmMemoryTube {
                    tube: coiommu_host_tube,
                    expose_with_viommu: false,
                }
                .into(),
            );
            let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
            // One end goes to the coIOMMU device, the other to the balloon so
            // it can coordinate with the coIOMMU.
            #[cfg(feature = "balloon")]
            match Tube::pair() {
                Ok((x, y)) => {
                    coiommu_tube = Some(x);
                    balloon_inflate_tube = Some(y);
                }
                Err(x) => return Err(x).context("failed to create coiommu tube"),
            }
            let dev = CoIommuDev::new(
                vm.get_memory().clone(),
                vfio_container,
                VmMemoryClient::new(coiommu_device_tube),
                coiommu_tube,
                coiommu_attached_endpoints,
                vcpu_count,
                cfg.coiommu_param.unwrap_or_default(),
            )
            .context("failed to create coiommu device")?;

            devices.push((
                Box::new(dev),
                simple_jail(cfg.jail_config.as_ref(), "coiommu_device")?,
            ));
        }
    }

    let stubs = create_virtio_devices(
        cfg,
        vm,
        resources,
        add_control_tube,
        vm_evt_wrtube,
        #[cfg(feature = "balloon")]
        balloon_inflate_tube,
        worker_process_pids,
        #[cfg(feature = "gpu")]
        render_server_fd,
        #[cfg(feature = "gpu")]
        has_vfio_gfx_device,
        #[cfg(feature = "registered_events")]
        registered_evt_q,
    )?;

    // Wrap each virtio device stub in a VirtioPciDevice, creating the MSI,
    // shared-memory (when the device exposes one), ioevent and VM control
    // tubes it needs.
    for stub in stubs {
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));

        let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
            let (host_tube, device_tube) =
                Tube::pair().context("failed to create shared memory tube")?;
            add_control_tube(
                VmMemoryTube {
                    tube: host_tube,
                    expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
                }
                .into(),
            );
            Some(device_tube)
        } else {
            None
        };

        let (ioevent_host_tube, ioevent_device_tube) =
            Tube::pair().context("failed to create ioevent tube")?;
        add_control_tube(
            VmMemoryTube {
                tube: ioevent_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );

        let (host_tube, device_tube) =
            Tube::pair().context("failed to create device control tube")?;
        add_control_tube(TaggedControlTube::Vm(host_tube).into());

        let dev = VirtioPciDevice::new(
            vm.get_memory().clone(),
            stub.dev,
            msi_device_tube,
            cfg.disable_virtio_intx,
            shared_memory_tube.map(VmMemoryClient::new),
            VmMemoryClient::new(ioevent_device_tube),
            device_tube,
        )
        .context("failed to create virtio pci dev")?;

        devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
    }

    #[cfg(feature = "usb")]
    if cfg.usb {
        // Create xhci controller.
        let usb_controller = Box::new(XhciController::new(
            vm.get_memory().clone(),
            Box::new(usb_provider),
        ));
        devices.push((
            usb_controller,
            simple_jail(cfg.jail_config.as_ref(), "xhci_device")?,
        ));
    }

    for params in &cfg.stub_pci_devices {
        // Stub devices don't need jailing since they don't do anything.
        devices.push((Box::new(StubPciDevice::new(params)), None));
    }

    // pvpanic device reports guest panics over the VM event tube.
    devices.push((
        Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
        None,
    ));

    Ok(devices)
}
1139 
create_mmio_file_backed_mappings( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, ) -> Result<()>1140 fn create_mmio_file_backed_mappings(
1141     cfg: &Config,
1142     vm: &mut impl Vm,
1143     resources: &mut SystemAllocator,
1144 ) -> Result<()> {
1145     for mapping in &cfg.file_backed_mappings_mmio {
1146         let file = mapping
1147             .open()
1148             .context("failed to open file for file-backed mapping")?;
1149         let prot = if mapping.writable {
1150             Protection::read_write()
1151         } else {
1152             Protection::read()
1153         };
1154         let size = mapping
1155             .size
1156             .try_into()
1157             .context("Invalid size for file-backed mapping")?;
1158         let memory_mapping = MemoryMappingBuilder::new(size)
1159             .from_file(&file)
1160             .offset(mapping.offset)
1161             .protection(prot)
1162             .build()
1163             .context("failed to map backing file for file-backed mapping")?;
1164 
1165         let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
1166             .context("failed to convert to AddressRange")?;
1167         match resources.mmio_allocator_any().allocate_at(
1168             mapping_range,
1169             Alloc::FileBacked(mapping.address),
1170             "file-backed mapping".to_owned(),
1171         ) {
1172             // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
1173             // consider it an error.
1174             // TODO(b/222769529): Reserve this region in a global memory address space allocator
1175             // once we have that so nothing else can accidentally overlap with it.
1176             Ok(()) | Err(resources::Error::OutOfSpace) => {}
1177             e => e.context("failed to allocate guest address for file-backed mapping")?,
1178         }
1179 
1180         vm.add_memory_region(
1181             GuestAddress(mapping.address),
1182             Box::new(memory_mapping),
1183             !mapping.writable,
1184             /* log_dirty_pages = */ false,
1185             MemCacheType::CacheCoherent,
1186         )
1187         .context("failed to configure file-backed mapping")?;
1188     }
1189 
1190     Ok(())
1191 }
1192 
#[cfg(target_arch = "x86_64")]
/// Collection of devices related to PCI hotplug.
struct HotPlugStub {
    /// Map from bus index to hotplug bus.
    hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    /// Bus ranges of devices for virtio-iommu.
    iommu_bus_ranges: Vec<RangeInclusive<u32>>,
    /// Map from gpe index to GpeNotify devices.
    gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
    /// Map from bus index to PmeNotify devices.
    pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
}
1205 
1206 #[cfg(target_arch = "x86_64")]
1207 impl HotPlugStub {
1208     /// Constructs empty HotPlugStub.
new() -> Self1209     fn new() -> Self {
1210         Self {
1211             hotplug_buses: BTreeMap::new(),
1212             iommu_bus_ranges: Vec::new(),
1213             gpe_notify_devs: BTreeMap::new(),
1214             pme_notify_devs: BTreeMap::new(),
1215         }
1216     }
1217 }
1218 
1219 #[cfg(target_arch = "x86_64")]
1220 /// Creates PCIE root port with only virtual devices.
1221 ///
1222 /// user doesn't specify host pcie root port which link to this virtual pcie rp,
1223 /// find the empty bus and create a total virtual pcie rp
create_pure_virtual_pcie_root_port( sys_allocator: &mut SystemAllocator, add_control_tube: &mut impl FnMut(AnyControlTube), devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, hp_bus_count: u8, ) -> Result<HotPlugStub>1224 fn create_pure_virtual_pcie_root_port(
1225     sys_allocator: &mut SystemAllocator,
1226     add_control_tube: &mut impl FnMut(AnyControlTube),
1227     devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
1228     hp_bus_count: u8,
1229 ) -> Result<HotPlugStub> {
1230     let mut hp_sec_buses = Vec::new();
1231     let mut hp_stub = HotPlugStub::new();
1232     // Create Pcie Root Port for non-root buses, each non-root bus device will be
1233     // connected behind a virtual pcie root port.
1234     for i in 1..255 {
1235         if sys_allocator.pci_bus_empty(i) {
1236             if hp_sec_buses.len() < hp_bus_count.into() {
1237                 hp_sec_buses.push(i);
1238             }
1239             continue;
1240         }
1241         let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
1242         hp_stub
1243             .pme_notify_devs
1244             .insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
1245         let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1246         add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
1247         let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1248         // no ipc is used if the root port disables hotplug
1249         devices.push((pci_bridge, None));
1250     }
1251 
1252     // Create Pcie Root Port for hot-plug
1253     if hp_sec_buses.len() < hp_bus_count.into() {
1254         return Err(anyhow!("no more addresses are available"));
1255     }
1256 
1257     for hp_sec_bus in hp_sec_buses {
1258         let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
1259         hp_stub.pme_notify_devs.insert(
1260             hp_sec_bus,
1261             pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
1262         );
1263         let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1264         add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
1265         let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1266 
1267         hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
1268             PciAddress {
1269                 bus: pci_bridge.get_secondary_num(),
1270                 dev: 0,
1271                 func: 0,
1272             }
1273             .to_u32(),
1274             PciAddress {
1275                 bus: pci_bridge.get_subordinate_num(),
1276                 dev: 32,
1277                 func: 8,
1278             }
1279             .to_u32(),
1280         ));
1281 
1282         devices.push((pci_bridge, None));
1283         hp_stub
1284             .hotplug_buses
1285             .insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
1286     }
1287     Ok(hp_stub)
1288 }
1289 
/// Assembles the `VmComponents` for booting the VM from `cfg`.
///
/// Opens the kernel/BIOS, initrd, pvmfw and pflash images, computes the
/// memory and swiotlb sizes, reads host CPU topology when requested, and on
/// arm/aarch64 derives virtual cpufreq frequency tables, normalized IPC
/// ratios and cgroup-based frequency-domain assignments.
///
/// Panics if no kernel/BIOS executable was configured, and (arm/aarch64 with
/// virt_cpufreq) on inconsistent affinity/frequency-domain configuration.
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
        Some(
            open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
        )
    } else {
        None
    };
    let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
        Some(
            open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
        )
    } else {
        None
    };

    let vm_image = match cfg.executable_path {
        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
            open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
                || format!("failed to open kernel image {}", kernel_path.display()),
            )?,
        ),
        Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
            open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
        ),
        _ => panic!("Did not receive a bios or kernel, should be impossible."),
    };

    // `cfg.swiotlb` is in MiB. Protected VMs default to 64 MiB when no size
    // is given; unprotected VMs get no swiotlb by default.
    let swiotlb = if let Some(size) = cfg.swiotlb {
        Some(
            size.checked_mul(1024 * 1024)
                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
        )
    } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
        None
    } else {
        Some(64 * 1024 * 1024)
    };

    let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
    {
        (
            Some(
                open_file_or_duplicate(
                    &pflash_parameters.path,
                    OpenOptions::new().read(true).write(true),
                )
                .with_context(|| {
                    format!("failed to open pflash {}", pflash_parameters.path.display())
                })?,
            ),
            pflash_parameters.block_size,
        )
    } else {
        (None, 0)
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut cpu_frequencies = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut normalized_cpu_ipc_ratios = BTreeMap::new();

    // if --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg
    let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
    let (cpu_clusters, cpu_capacity) = if cfg.host_cpu_topology {
        (
            Arch::get_host_cpu_clusters()?,
            Arch::get_host_cpu_capacity()?,
        )
    } else {
        (cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domain_paths = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domains = BTreeMap::new();

    // Virtual cpufreq support: build per-vCPU frequency tables either from
    // explicit config or from the host CPUs each vCPU is affined to.
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    if cfg.virt_cpufreq || cfg.virt_cpufreq_v2 {
        if !cfg.cpu_frequencies_khz.is_empty() {
            cpu_frequencies = cfg.cpu_frequencies_khz.clone();
        } else {
            match Arch::get_host_cpu_frequencies_khz() {
                Ok(host_cpu_frequencies) => {
                    for cpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
                        let vcpu_affinity = match cfg.vcpu_affinity.clone() {
                            Some(VcpuAffinity::Global(v)) => v,
                            Some(VcpuAffinity::PerVcpu(mut m)) => {
                                m.remove(&cpu_id).unwrap_or_default()
                            }
                            None => {
                                panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
                            }
                        };

                        // Check that the physical CPUs that the vCPU is affined to all share the
                        // same frequency domain.
                        if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
                            for cpu in vcpu_affinity.iter() {
                                if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
                                    if frequencies != freq_domain {
                                        panic!("Affined CPUs do not share a frequency domain!");
                                    }
                                }
                            }
                            cpu_frequencies.insert(cpu_id, freq_domain.clone());
                        } else {
                            panic!("No frequency domain for cpu:{}", cpu_id);
                        }
                    }
                }
                Err(e) => {
                    warn!("Unable to get host cpu frequencies {:#}", e);
                }
            }
        }

        if !cpu_frequencies.is_empty() {
            let host_max_freqs = Arch::get_host_cpu_max_freq_khz()?;
            // Find the highest maximum frequency over all host CPUs. The guest CPU IPC ratios will
            // be normalized by dividing by this value.
            let host_max_freq = host_max_freqs.values().copied().max().unwrap_or_default();

            normalized_cpu_ipc_ratios = normalize_cpu_ipc_ratios(
                cpu_frequencies.iter().map(|(cpu_id, frequencies)| {
                    (
                        *cpu_id,
                        frequencies.iter().copied().max().unwrap_or_default(),
                    )
                }),
                host_max_freq,
                // Missing per-CPU IPC ratios default to 1024.
                |cpu_id| cfg.cpu_ipc_ratio.get(&cpu_id).copied().unwrap_or(1024),
            )?;

            if !cfg.cpu_freq_domains.is_empty() {
                let cgroup_path = cfg
                    .vcpu_cgroup_path
                    .clone()
                    .context("cpu_freq_domains requires vcpu_cgroup_path")?;

                if !cgroup_path.join("cgroup.controllers").exists() {
                    panic!("CGroupsV2 must be enabled for cpu freq domain support!");
                }

                // Assign parent crosvm process to top level cgroup
                let cgroup_procs_path = cgroup_path.join("cgroup.procs");
                std::fs::write(
                    cgroup_procs_path.clone(),
                    process::id().to_string().as_bytes(),
                )
                .with_context(|| {
                    format!(
                        "failed to create vcpu-cgroup-path {}",
                        cgroup_procs_path.display(),
                    )
                })?;

                for (freq_domain_idx, cpus) in cfg.cpu_freq_domains.iter().enumerate() {
                    let vcpu_domain_path =
                        cgroup_path.join(format!("vcpu-domain{}", freq_domain_idx));
                    // Create subtree for domain
                    create_dir_all(&vcpu_domain_path)?;

                    // Set vcpu_domain cgroup type as 'threaded' to get thread level granularity
                    // controls
                    let cgroup_type_path = cgroup_path.join(vcpu_domain_path.join("cgroup.type"));
                    std::fs::write(cgroup_type_path.clone(), b"threaded").with_context(|| {
                        format!(
                            "failed to create vcpu-cgroup-path {}",
                            cgroup_type_path.display(),
                        )
                    })?;
                    for core_idx in cpus.iter() {
                        vcpu_domain_paths.insert(*core_idx, vcpu_domain_path.clone());
                        vcpu_domains.insert(*core_idx, freq_domain_idx as u32);
                    }
                }
            }
        }
    }

    Ok(VmComponents {
        #[cfg(target_arch = "x86_64")]
        ac_adapter: cfg.ac_adapter,
        #[cfg(target_arch = "x86_64")]
        break_linux_pci_config_io: cfg.break_linux_pci_config_io,
        // `cfg.memory` is in MiB; default is 256 MiB.
        memory_size: cfg
            .memory
            .unwrap_or(256)
            .checked_mul(1024 * 1024)
            .ok_or_else(|| anyhow!("requested memory size too large"))?,
        swiotlb,
        fw_cfg_enable,
        bootorder_fw_cfg_blob: Vec::new(),
        vcpu_count: cfg.vcpu_count.unwrap_or(1),
        vcpu_affinity: cfg.vcpu_affinity.clone(),
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domains,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domain_paths,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        cpu_frequencies,
        fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
        cpu_clusters,
        cpu_capacity,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        normalized_cpu_ipc_ratios,
        no_smt: cfg.no_smt,
        hugepages: cfg.hugepages,
        hv_cfg: hypervisor::Config {
            #[cfg(target_arch = "aarch64")]
            mte: cfg.mte,
            protection_type: cfg.protection_type,
        },
        vm_image,
        android_fstab: cfg
            .android_fstab
            .as_ref()
            .map(|x| {
                File::open(x)
                    .with_context(|| format!("failed to open android fstab file {}", x.display()))
            })
            .map_or(Ok(None), |v| v.map(Some))?,
        pstore: cfg.pstore.clone(),
        pflash_block_size,
        pflash_image,
        initrd_image,
        extra_kernel_params: cfg.params.clone(),
        acpi_sdts: cfg
            .acpi_tables
            .iter()
            .map(|path| {
                SDT::from_file(path)
                    .with_context(|| format!("failed to open ACPI file {}", path.display()))
            })
            .collect::<Result<Vec<SDT>>>()?,
        rt_cpus: cfg.rt_cpus.clone(),
        delay_rt: cfg.delay_rt,
        no_i8042: cfg.no_i8042,
        no_rtc: cfg.no_rtc,
        #[cfg(target_arch = "x86_64")]
        smbios: cfg.smbios.clone(),
        host_cpu_topology: cfg.host_cpu_topology,
        itmt: cfg.itmt,
        #[cfg(target_arch = "x86_64")]
        force_s2idle: cfg.force_s2idle,
        pvm_fw: pvm_fw_image,
        pci_config: cfg.pci_config,
        dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
        boot_cpu: cfg.boot_cpu,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        virt_cpufreq_v2: cfg.virt_cpufreq_v2,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        sve_config: cfg.sve.unwrap_or_default(),
    })
}
1550 
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
/// Scales each CPU's IPC ratio by the ratio of its maximum frequency to the
/// host-wide maximum frequency.
///
/// Returns an error if `host_max_freq` is zero or any scaled ratio does not
/// fit in a `u32`.
fn normalize_cpu_ipc_ratios(
    max_frequency_per_cpu: impl Iterator<Item = (usize, u32)>,
    host_max_freq: u32,
    cpu_ipc_ratio: impl Fn(usize) -> u32,
) -> Result<BTreeMap<usize, u32>> {
    if host_max_freq == 0 {
        return Err(anyhow!("invalid host_max_freq 0"));
    }
    let denom = u64::from(host_max_freq);

    max_frequency_per_cpu
        .map(|(cpu_id, max_freq)| {
            // Widen to u64 so `u32 * u32` cannot overflow before the divide.
            let scaled = u64::from(cpu_ipc_ratio(cpu_id)) * u64::from(max_freq) / denom;
            let ratio = u32::try_from(scaled)
                .context("normalized CPU IPC ratio out of u32 range")?;
            Ok((cpu_id, ratio))
        })
        .collect()
}
1578 
/// Final outcome of a VM run, as returned by the `run_*` hypervisor entry
/// points (e.g. `run_kvm`, `run_gz`, `run_gunyah` all return
/// `Result<ExitState>`).
// NOTE(review): the individual variant semantics are inferred from their
// names; the code that produces each variant is not visible in this section
// of the file — confirm against the VM event loop before relying on them.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
    Reset,
    Stop,
    Crash,
    GuestPanic,
    WatchdogReset,
}
1587 
1588 // Replaces ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
1589 // Returns the updated guest memory layout.
punch_holes_in_guest_mem_layout_for_mappings( guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>, file_backed_mappings_ram: &[FileBackedMappingParameters], ) -> Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>>1590 fn punch_holes_in_guest_mem_layout_for_mappings(
1591     guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
1592     file_backed_mappings_ram: &[FileBackedMappingParameters],
1593 ) -> Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>> {
1594     // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
1595     // at end is not included in the range).
1596     let mut layout_set = BTreeSet::new();
1597     for (addr, size, options) in &guest_mem_layout {
1598         layout_set.insert((addr.offset(), addr.offset() + size, options.clone()));
1599     }
1600 
1601     // Make sure the RAM mappings are a subset of the RAM memory layout.
1602     // For simplicity, we currently require each mapping to be fully contained within a single
1603     // region of the input layout.
1604     for mapping in file_backed_mappings_ram {
1605         anyhow::ensure!(
1606             layout_set
1607                 .iter()
1608                 .any(|(addr, size, _)| *addr <= mapping.address
1609                     && mapping.address + mapping.size <= *addr + *size),
1610             "RAM file-backed-mapping must be a subset of a RAM region"
1611         );
1612     }
1613 
1614     for mapping in file_backed_mappings_ram.iter().cloned() {
1615         let mapping_start = mapping.address;
1616         let mapping_end = mapping_start + mapping.size;
1617         let mut purpose = None;
1618         // Repeatedly split overlapping guest memory regions until no overlaps remain.
1619         while let Some((range_start, range_end, options)) = layout_set
1620             .iter()
1621             .find(|&&(range_start, range_end, _)| {
1622                 mapping_start < range_end && mapping_end > range_start
1623             })
1624             .cloned()
1625         {
1626             let purpose = *purpose.get_or_insert(options.purpose);
1627             anyhow::ensure!(
1628                 options.purpose == purpose,
1629                 "RAM file-backed-mapping cannot span regions with different purposes: {:?} vs {:?}",
1630                 options.purpose,
1631                 purpose
1632             );
1633 
1634             layout_set.remove(&(range_start, range_end, options.clone()));
1635 
1636             if range_start < mapping_start {
1637                 layout_set.insert((range_start, mapping_start, options.clone()));
1638             }
1639             if range_end > mapping_end {
1640                 layout_set.insert((mapping_end, range_end, options));
1641             }
1642         }
1643         layout_set.insert((
1644             mapping_start,
1645             mapping_end,
1646             MemoryRegionOptions::new()
1647                 .purpose(purpose.unwrap())
1648                 .file_backed(mapping),
1649         ));
1650     }
1651 
1652     // Build the final guest memory layout from the modified layout_set.
1653     Ok(layout_set
1654         .into_iter()
1655         .map(|(start, end, options)| (GuestAddress(start), end - start, options))
1656         .collect())
1657 }
1658 
create_guest_memory( cfg: &Config, components: &VmComponents, arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout, hypervisor: &impl Hypervisor, ) -> Result<GuestMemory>1659 fn create_guest_memory(
1660     cfg: &Config,
1661     components: &VmComponents,
1662     arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1663     hypervisor: &impl Hypervisor,
1664 ) -> Result<GuestMemory> {
1665     let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
1666         .context("failed to create guest memory layout")?;
1667 
1668     let guest_mem_layout = punch_holes_in_guest_mem_layout_for_mappings(
1669         guest_mem_layout,
1670         &cfg.file_backed_mappings_ram,
1671     )?;
1672 
1673     let mut guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1674         .context("failed to create guest memory")?;
1675     let mut mem_policy = MemoryPolicy::empty();
1676     if components.hugepages {
1677         mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1678     }
1679 
1680     if cfg.lock_guest_memory {
1681         mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1682     }
1683     guest_mem.set_memory_policy(mem_policy);
1684 
1685     if cfg.unmap_guest_memory_on_fork {
1686         // Note that this isn't compatible with sandboxing. We could potentially fix that by
1687         // delaying the call until after the sandboxed devices are forked. However, the main use
1688         // for this is in conjunction with protected VMs, where most of the guest memory has been
1689         // unshared with the host. We'd need to be confident that the guest memory is unshared with
1690         // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1691         // So, for now we keep things simple to be safe.
1692         guest_mem.use_dontfork().context("use_dontfork failed")?;
1693     }
1694 
1695     Ok(guest_mem)
1696 }
1697 
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
/// Runs the VM on the GenieZone hypervisor: opens the device, builds guest
/// memory and the VM, sets up the in-kernel irqchip, and enters `run_vm`.
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::GeniezoneKernelIrqChip;
    use hypervisor::geniezone::Geniezone;
    use hypervisor::geniezone::GeniezoneVcpu;
    use hypervisor::geniezone::GeniezoneVm;

    // Fall back to the default device node when no explicit path was given.
    let device_path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
    let gzvm = Geniezone::new_with_path(device_path)
        .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gzvm)?;

    // Launch the vmm-swap monitor process (if configured) before the VM starts.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm =
        GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // GenieZone only supports the in-kernel irqchip, so there is never an
    // ioapic tube to hand to run_vm.
    let ioapic_host_tube = None;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
        IrqChipKind::Kernel => GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
            .context("failed to create IRQ chip")?,
    };

    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1754 
/// Runs the VM on the KVM hypervisor.
///
/// Opens the KVM device (`device_path`, or `KVM_PATH` by default), builds
/// guest memory and the VM, constructs the irqchip flavor requested by
/// `cfg.irq_chip`, and then hands control to `run_vm`.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::KvmKernelIrqChip;
    #[cfg(target_arch = "x86_64")]
    use devices::KvmSplitIrqChip;
    use hypervisor::kvm::Kvm;
    use hypervisor::kvm::KvmVcpu;
    use hypervisor::kvm::KvmVm;

    let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &kvm)?;

    // Launch the vmm-swap monitor process (if configured) before the VM starts.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // When ITMT is enabled, disable guest read access to MSR_PLATFORM_INFO.
    #[cfg(target_arch = "x86_64")]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    // Check that the VM was actually created in protected mode as expected.
    // This check is only needed on aarch64. On x86_64, protected VM creation will fail
    // if protected mode is not supported.
    #[cfg(not(target_arch = "x86_64"))]
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Local adapter so either irqchip flavor can be passed to `run_vm` as a
    // `&mut dyn IrqChipArch`; the Split variant exists only on x86_64.
    enum KvmIrqChip {
        #[cfg(target_arch = "x86_64")]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(target_arch = "x86_64")]
                KvmIrqChip::Split(i) => i,
                KvmIrqChip::Kernel(i) => i,
            }
        }
    }

    // Only populated in split irqchip mode; stays None for the kernel irqchip.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Userspace => {
            bail!("KVM userspace irqchip mode not implemented");
        }
        IrqChipKind::Split => {
            // On non-x86_64 the bail! below diverges; on x86_64 the cfg'd block
            // is the value of this match arm.
            #[cfg(not(target_arch = "x86_64"))]
            bail!("KVM split irqchip mode only supported on x86 processors");
            #[cfg(target_arch = "x86_64")]
            {
                let (host_tube, ioapic_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                ioapic_host_tube = Some(host_tube);
                KvmIrqChip::Split(
                    KvmSplitIrqChip::new(
                        vm_clone,
                        components.vcpu_count,
                        ioapic_device_tube,
                        // NOTE(review): 24 looks like the IOAPIC pin count —
                        // confirm against KvmSplitIrqChip::new's signature.
                        Some(24),
                    )
                    .context("failed to create IRQ chip")?,
                )
            }
        }
        IrqChipKind::Kernel => {
            ioapic_host_tube = None;
            KvmIrqChip::Kernel(
                KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
                    .context("failed to create IRQ chip")?,
            )
        }
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1858 
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
/// Runs the VM on the Gunyah hypervisor: opens the device, builds guest
/// memory and the VM (passing through the optional Qualcomm trusted-VM ids),
/// sets up the Gunyah irqchip, and enters `run_vm`.
fn run_gunyah(
    device_path: Option<&Path>,
    qcom_trusted_vm_id: Option<u16>,
    qcom_trusted_vm_pas_id: Option<u32>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::Gunyah;
    use hypervisor::gunyah::GunyahVcpu;
    use hypervisor::gunyah::GunyahVm;

    let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(device_path)
        .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gunyah)?;

    // Launch the vmm-swap monitor process (if configured) before the VM starts.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    // (Reformatted per rustfmt; previously a single over-long line.)
    let vm = GunyahVm::new(
        &gunyah,
        qcom_trusted_vm_id,
        qcom_trusted_vm_pas_id,
        guest_mem,
        components.hv_cfg,
    )
    .context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    // Context added for consistency with run_kvm/run_gz.
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut GunyahIrqChip::new(vm_clone)?,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1910 
1911 /// Choose a default hypervisor if no `--hypervisor` option was specified.
get_default_hypervisor() -> Option<HypervisorKind>1912 fn get_default_hypervisor() -> Option<HypervisorKind> {
1913     let kvm_path = Path::new(KVM_PATH);
1914     if kvm_path.exists() {
1915         return Some(HypervisorKind::Kvm {
1916             device: Some(kvm_path.to_path_buf()),
1917         });
1918     }
1919 
1920     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1921     #[cfg(feature = "geniezone")]
1922     {
1923         let gz_path = Path::new(GENIEZONE_PATH);
1924         if gz_path.exists() {
1925             return Some(HypervisorKind::Geniezone {
1926                 device: Some(gz_path.to_path_buf()),
1927             });
1928         }
1929     }
1930 
1931     #[cfg(all(
1932         unix,
1933         any(target_arch = "arm", target_arch = "aarch64"),
1934         feature = "gunyah"
1935     ))]
1936     {
1937         let gunyah_path = Path::new(GUNYAH_PATH);
1938         if gunyah_path.exists() {
1939             return Some(HypervisorKind::Gunyah {
1940                 device: Some(gunyah_path.to_path_buf()),
1941                 qcom_trusted_vm_id: None,
1942                 qcom_trusted_vm_pas_id: None,
1943             });
1944         }
1945     }
1946 
1947     None
1948 }
1949 
run_config(cfg: Config) -> Result<ExitState>1950 pub fn run_config(cfg: Config) -> Result<ExitState> {
1951     let components = setup_vm_components(&cfg)?;
1952 
1953     let hypervisor = cfg
1954         .hypervisor
1955         .clone()
1956         .or_else(get_default_hypervisor)
1957         .context("no enabled hypervisor")?;
1958 
1959     debug!("creating hypervisor: {:?}", hypervisor);
1960 
1961     match hypervisor {
1962         HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
1963         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1964         #[cfg(feature = "geniezone")]
1965         HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
1966         #[cfg(all(
1967             unix,
1968             any(target_arch = "arm", target_arch = "aarch64"),
1969             feature = "gunyah"
1970         ))]
1971         HypervisorKind::Gunyah { device,
1972                                  qcom_trusted_vm_id,
1973                                  qcom_trusted_vm_pas_id
1974                                } => run_gunyah(
1975                                         device.as_deref(),
1976                                         qcom_trusted_vm_id,
1977                                         qcom_trusted_vm_pas_id,
1978                                         cfg, components),
1979     }
1980 }
1981 
run_vm<Vcpu, V>( cfg: Config, #[allow(unused_mut)] mut components: VmComponents, arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout, mut vm: V, irq_chip: &mut dyn IrqChipArch, ioapic_host_tube: Option<Tube>, #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, ) -> Result<ExitState> where Vcpu: VcpuArch + 'static, V: VmArch + 'static,1982 fn run_vm<Vcpu, V>(
1983     cfg: Config,
1984     #[allow(unused_mut)] mut components: VmComponents,
1985     arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1986     mut vm: V,
1987     irq_chip: &mut dyn IrqChipArch,
1988     ioapic_host_tube: Option<Tube>,
1989     #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
1990 ) -> Result<ExitState>
1991 where
1992     Vcpu: VcpuArch + 'static,
1993     V: VmArch + 'static,
1994 {
1995     if cfg.jail_config.is_some() {
1996         // Printing something to the syslog before entering minijail so that libc's syslogger has a
1997         // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
1998         // access to those files will not be possible.
1999         info!("crosvm entering multiprocess mode");
2000     }
2001 
2002     let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
2003     metrics::initialize(metrics_send);
2004 
2005     #[cfg(all(feature = "pci-hotplug", feature = "swap"))]
2006     let swap_device_helper = match &swap_controller {
2007         Some(swap_controller) => Some(swap_controller.create_device_helper()?),
2008         None => None,
2009     };
2010     // pci-hotplug is only implemented for x86_64 for now, attempting to use it on other platform
2011     // would crash.
2012     #[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
2013     if cfg.pci_hotplug_slots.is_some() {
2014         bail!("pci-hotplug is not implemented for non x86_64 architecture");
2015     }
2016     // hotplug_manager must be created before vm is started since it forks jail warden process.
2017     #[cfg(feature = "pci-hotplug")]
2018     // TODO(293801301): Remove unused_mut after aarch64 support
2019     #[allow(unused_mut)]
2020     let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
2021         Some(PciHotPlugManager::new(
2022             vm.get_memory().clone(),
2023             &cfg,
2024             #[cfg(feature = "swap")]
2025             swap_device_helper,
2026         )?)
2027     } else {
2028         None
2029     };
2030 
2031     #[cfg(feature = "usb")]
2032     let (usb_control_tube, usb_provider) =
2033         DeviceProvider::new().context("failed to create usb provider")?;
2034 
2035     // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
2036     // before any jailed devices have been spawned, so that we can catch any of them that fail very
2037     // quickly.
2038     let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
2039 
2040     let control_server_socket = match &cfg.socket_path {
2041         Some(path) => Some(UnlinkUnixSeqpacketListener(
2042             UnixSeqpacketListener::bind(path).context("failed to create control server")?,
2043         )),
2044         None => None,
2045     };
2046 
2047     let mut all_control_tubes = Vec::new();
2048     let mut add_control_tube = |t| all_control_tubes.push(t);
2049 
2050     if let Some(ioapic_host_tube) = ioapic_host_tube {
2051         add_control_tube(AnyControlTube::IrqTube(ioapic_host_tube));
2052     }
2053 
2054     let battery = if cfg.battery_config.is_some() {
2055         #[cfg_attr(
2056             not(feature = "power-monitor-powerd"),
2057             allow(clippy::manual_map, clippy::needless_match, unused_mut)
2058         )]
2059         let jail = if let Some(jail_config) = cfg.jail_config.as_ref() {
2060             let mut config = SandboxConfig::new(jail_config, "battery");
2061             #[cfg(feature = "power-monitor-powerd")]
2062             {
2063                 config.bind_mounts = true;
2064             }
2065             let mut jail =
2066                 create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
2067 
2068             // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
2069             #[cfg(feature = "power-monitor-powerd")]
2070             {
2071                 let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
2072                 jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
2073             }
2074             Some(jail)
2075         } else {
2076             None
2077         };
2078         (cfg.battery_config.as_ref().map(|c| c.type_), jail)
2079     } else {
2080         (cfg.battery_config.as_ref().map(|c| c.type_), None)
2081     };
2082 
2083     let (vm_evt_wrtube, vm_evt_rdtube) =
2084         Tube::directional_pair().context("failed to create vm event tube")?;
2085 
2086     let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2087     let mut sys_allocator = SystemAllocator::new(
2088         Arch::get_system_allocator_config(&vm, arch_memory_layout),
2089         pstore_size,
2090         &cfg.mmio_address_ranges,
2091     )
2092     .context("failed to create system allocator")?;
2093 
2094     let ramoops_region = match &components.pstore {
2095         Some(pstore) => Some(
2096             arch::pstore::create_memory_region(
2097                 &mut vm,
2098                 sys_allocator.reserved_region().unwrap(),
2099                 pstore,
2100             )
2101             .context("failed to allocate pstore region")?,
2102         ),
2103         None => None,
2104     };
2105 
2106     create_mmio_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
2107 
2108     #[cfg(feature = "gpu")]
2109     // Hold on to the render server jail so it keeps running until we exit run_vm()
2110     let (_render_server_jail, render_server_fd) =
2111         if let Some(parameters) = &cfg.gpu_render_server_parameters {
2112             let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
2113             (Some(ScopedMinijail(jail)), Some(fd))
2114         } else {
2115             (None, None)
2116         };
2117 
2118     let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
2119         BTreeMap::new();
2120     let mut iova_max_addr: Option<u64> = None;
2121 
2122     let mut vfio_container_manager = VfioContainerManager::new();
2123 
2124     #[cfg(feature = "registered_events")]
2125     let (reg_evt_wrtube, reg_evt_rdtube) =
2126         Tube::directional_pair().context("failed to create registered event tube")?;
2127 
2128     let mut worker_process_pids = BTreeSet::new();
2129 
2130     let mut devices = create_devices(
2131         &cfg,
2132         &mut vm,
2133         &mut sys_allocator,
2134         &mut add_control_tube,
2135         &vm_evt_wrtube,
2136         &mut iommu_attached_endpoints,
2137         #[cfg(feature = "usb")]
2138         usb_provider,
2139         #[cfg(feature = "gpu")]
2140         render_server_fd,
2141         &mut iova_max_addr,
2142         #[cfg(feature = "registered_events")]
2143         &reg_evt_wrtube,
2144         &mut vfio_container_manager,
2145         &mut worker_process_pids,
2146     )?;
2147 
2148     #[cfg(feature = "pci-hotplug")]
2149     // TODO(293801301): Remove unused_variables after aarch64 support
2150     #[allow(unused_variables)]
2151     let pci_hotplug_slots = cfg.pci_hotplug_slots;
2152     #[cfg(not(feature = "pci-hotplug"))]
2153     #[allow(unused_variables)]
2154     let pci_hotplug_slots: Option<u8> = None;
2155     #[cfg(target_arch = "x86_64")]
2156     let hp_stub = create_pure_virtual_pcie_root_port(
2157         &mut sys_allocator,
2158         &mut add_control_tube,
2159         &mut devices,
2160         pci_hotplug_slots.unwrap_or(1),
2161     )?;
2162 
2163     arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
2164 
2165     let pci_devices: Vec<&dyn PciDevice> = devices
2166         .iter()
2167         .filter_map(|d| (d.0).as_pci_device())
2168         .collect();
2169 
2170     let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
2171         .into_iter()
2172         .flat_map(|s| {
2173             if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
2174                 std::iter::zip(
2175                     Some(virtio_pci_device.virtio_device()),
2176                     virtio_pci_device.pci_address(),
2177                 )
2178                 .next()
2179             } else {
2180                 None
2181             }
2182         })
2183         .collect();
2184 
2185     let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
2186         .iter()
2187         .flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
2188         .collect();
2189 
2190     // order the OpenFirmware device paths, in ascending order, by their boot_index
2191     open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));
2192 
2193     // "/pci@iocf8/" is x86 specific and represents the root at the system bus port
2194     let mut bootorder_fw_cfg_blob =
2195         open_firmware_device_paths
2196             .into_iter()
2197             .fold(Vec::new(), |a, b| {
2198                 a.into_iter()
2199                     .chain("/pci@i0cf8/".as_bytes().iter().copied())
2200                     .chain(b.0)
2201                     .chain("\n".as_bytes().iter().copied())
2202                     .collect()
2203             });
2204 
2205     // the "bootorder" file is expected to end with a null terminator
2206     bootorder_fw_cfg_blob.push(0);
2207 
2208     components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;
2209 
2210     // if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
2211     // "bootorder" file can be accessed by the guest.
2212     components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;
2213 
2214     let (translate_response_senders, request_rx) = setup_virtio_access_platform(
2215         &mut sys_allocator,
2216         &mut iommu_attached_endpoints,
2217         &mut devices,
2218     )?;
2219 
2220     #[cfg(target_arch = "x86_64")]
2221     let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
2222     #[cfg(not(target_arch = "x86_64"))]
2223     let iommu_bus_ranges = Vec::new();
2224 
2225     let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
2226         || (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
2227     {
2228         let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
2229         let iommu_dev = create_iommu_device(
2230             cfg.protection_type,
2231             cfg.jail_config.as_ref(),
2232             iova_max_addr.unwrap_or(u64::MAX),
2233             iommu_attached_endpoints,
2234             iommu_bus_ranges,
2235             translate_response_senders,
2236             request_rx,
2237             iommu_device_tube,
2238         )?;
2239 
2240         let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
2241         add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
2242         let (ioevent_host_tube, ioevent_device_tube) =
2243             Tube::pair().context("failed to create ioevent tube")?;
2244         add_control_tube(
2245             VmMemoryTube {
2246                 tube: ioevent_host_tube,
2247                 expose_with_viommu: false,
2248             }
2249             .into(),
2250         );
2251         let (host_tube, device_tube) =
2252             Tube::pair().context("failed to create device control tube")?;
2253         add_control_tube(TaggedControlTube::Vm(host_tube).into());
2254         let mut dev = VirtioPciDevice::new(
2255             vm.get_memory().clone(),
2256             iommu_dev.dev,
2257             msi_device_tube,
2258             cfg.disable_virtio_intx,
2259             None,
2260             VmMemoryClient::new(ioevent_device_tube),
2261             device_tube,
2262         )
2263         .context("failed to create virtio pci dev")?;
2264         // early reservation for viommu.
2265         dev.allocate_address(&mut sys_allocator)
2266             .context("failed to allocate resources early for virtio pci dev")?;
2267         let dev = Box::new(dev);
2268         devices.push((dev, iommu_dev.jail));
2269         Some(iommu_host_tube)
2270     } else {
2271         None
2272     };
2273 
2274     #[cfg(target_arch = "x86_64")]
2275     for device in devices
2276         .iter_mut()
2277         .filter_map(|(dev, _)| dev.as_pci_device_mut())
2278     {
2279         device.generate_acpi(&mut components.acpi_sdts);
2280     }
2281 
2282     // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
2283     let mut vcpu_ids = Vec::new();
2284 
2285     let guest_suspended_cvar = if cfg.force_s2idle {
2286         Some(Arc::new((Mutex::new(false), Condvar::new())))
2287     } else {
2288         None
2289     };
2290 
2291     let dt_overlays = cfg
2292         .device_tree_overlay
2293         .iter()
2294         .map(|o| {
2295             Ok(DtbOverlay {
2296                 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2297                     .with_context(|| {
2298                         format!("failed to open device tree overlay {}", o.path.display())
2299                     })?,
2300                 do_filter: o.filter_devs,
2301             })
2302         })
2303         .collect::<Result<Vec<DtbOverlay>>>()?;
2304 
2305     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2306     let vcpu_domain_paths = components.vcpu_domain_paths.clone();
2307 
2308     let mut linux = Arch::build_vm::<V, Vcpu>(
2309         components,
2310         arch_memory_layout,
2311         &vm_evt_wrtube,
2312         &mut sys_allocator,
2313         &cfg.serial_parameters,
2314         simple_jail(cfg.jail_config.as_ref(), "serial_device")?,
2315         battery,
2316         vm,
2317         ramoops_region,
2318         devices,
2319         irq_chip,
2320         &mut vcpu_ids,
2321         cfg.dump_device_tree_blob.clone(),
2322         simple_jail(cfg.jail_config.as_ref(), "serial_device")?,
2323         #[cfg(target_arch = "x86_64")]
2324         simple_jail(cfg.jail_config.as_ref(), "block_device")?,
2325         #[cfg(target_arch = "x86_64")]
2326         simple_jail(cfg.jail_config.as_ref(), "fw_cfg_device")?,
2327         #[cfg(feature = "swap")]
2328         &mut swap_controller,
2329         guest_suspended_cvar.clone(),
2330         dt_overlays,
2331         cfg.fdt_position,
2332         cfg.no_pmu,
2333     )
2334     .context("the architecture failed to build the vm")?;
2335 
2336     for tube in linux.vm_request_tubes.drain(..) {
2337         add_control_tube(TaggedControlTube::Vm(tube).into());
2338     }
2339 
2340     #[cfg(target_arch = "x86_64")]
2341     let (hp_control_tube, hp_worker_tube) = mpsc::channel();
2342     #[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
2343     if let Some(hotplug_manager) = &mut hotplug_manager {
2344         hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
2345     }
2346     #[cfg(target_arch = "x86_64")]
2347     let hp_thread = {
2348         for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
2349             #[cfg(feature = "pci-hotplug")]
2350             if let Some(hotplug_manager) = &mut hotplug_manager {
2351                 hotplug_manager.add_port(hp_bus)?;
2352             } else {
2353                 linux.hotplug_bus.insert(bus_num, hp_bus);
2354             }
2355             #[cfg(not(feature = "pci-hotplug"))]
2356             linux.hotplug_bus.insert(bus_num, hp_bus);
2357         }
2358 
2359         if let Some(pm) = &linux.pm {
2360             for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
2361                 pm.lock().register_gpe_notify_dev(gpe, notify_dev);
2362             }
2363             for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
2364                 pm.lock().register_pme_notify_dev(bus, notify_dev);
2365             }
2366         }
2367 
2368         let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
2369             Tube::pair().context("failed to create tube")?;
2370         add_control_tube(
2371             VmMemoryTube {
2372                 tube: hp_vm_mem_host_tube,
2373                 expose_with_viommu: false,
2374             }
2375             .into(),
2376         );
2377 
2378         let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
2379         let pci_root = linux.root_config.clone();
2380         std::thread::Builder::new()
2381             .name("pci_root".to_string())
2382             .spawn(move || {
2383                 start_pci_root_worker(
2384                     supports_readonly_mapping,
2385                     pci_root,
2386                     hp_worker_tube,
2387                     hp_vm_mem_worker_tube,
2388                 )
2389             })?
2390     };
2391 
2392     let flags = RutabagaGrallocBackendFlags::new().disable_vulkano();
2393     let gralloc = RutabagaGralloc::new(flags).context("failed to create gralloc")?;
2394 
2395     run_control(
2396         linux,
2397         sys_allocator,
2398         cfg,
2399         control_server_socket,
2400         all_control_tubes,
2401         #[cfg(feature = "usb")]
2402         usb_control_tube,
2403         vm_evt_rdtube,
2404         vm_evt_wrtube,
2405         sigchld_fd,
2406         gralloc,
2407         vcpu_ids,
2408         iommu_host_tube,
2409         #[cfg(target_arch = "x86_64")]
2410         hp_control_tube,
2411         #[cfg(target_arch = "x86_64")]
2412         hp_thread,
2413         #[cfg(feature = "pci-hotplug")]
2414         hotplug_manager,
2415         #[cfg(feature = "swap")]
2416         swap_controller,
2417         #[cfg(feature = "registered_events")]
2418         reg_evt_rdtube,
2419         guest_suspended_cvar,
2420         metrics_recv,
2421         vfio_container_manager,
2422         worker_process_pids,
2423         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2424         vcpu_domain_paths,
2425     )
2426 }
2427 
// Handling hotplug commands on the vm control thread can deadlock when that
// thread tries to acquire the lock for the PCI root. The deadlock happens when
// the vm control thread (thread A) is handling a hotplug command and tries to
// take the PCI root lock while that lock is already held by a device in
// thread B, which is itself sending a vm control request to thread A and
// waiting for a response. Thread A is blocked on acquiring the lock, so
// neither thread can make progress. To resolve this, we add this worker
// thread and push all work that locks the PCI root onto it.
2436 #[cfg(target_arch = "x86_64")]
start_pci_root_worker( supports_readonly_mapping: bool, pci_root: Arc<Mutex<PciRoot>>, hp_device_tube: mpsc::Receiver<PciRootCommand>, vm_control_tube: Tube, )2437 fn start_pci_root_worker(
2438     supports_readonly_mapping: bool,
2439     pci_root: Arc<Mutex<PciRoot>>,
2440     hp_device_tube: mpsc::Receiver<PciRootCommand>,
2441     vm_control_tube: Tube,
2442 ) {
2443     struct PciMmioMapperTube {
2444         supports_readonly_mapping: bool,
2445         vm_control_tube: Tube,
2446         registered_regions: BTreeMap<u32, VmMemoryRegionId>,
2447         next_id: u32,
2448     }
2449 
2450     impl PciMmioMapper for PciMmioMapperTube {
2451         fn supports_readonly_mapping(&self) -> bool {
2452             self.supports_readonly_mapping
2453         }
2454 
2455         fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
2456             let shmem = shmem
2457                 .try_clone()
2458                 .context("failed to create new SharedMemory")?;
2459             self.vm_control_tube
2460                 .send(&VmMemoryRequest::RegisterMemory {
2461                     source: VmMemorySource::SharedMemory(shmem),
2462                     dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
2463                     prot: Protection::read(),
2464                     cache: MemCacheType::CacheCoherent,
2465                 })
2466                 .context("failed to send request")?;
2467             match self.vm_control_tube.recv::<VmMemoryResponse>() {
2468                 Ok(VmMemoryResponse::RegisterMemory { region_id, .. }) => {
2469                     let cur_id = self.next_id;
2470                     self.registered_regions.insert(cur_id, region_id);
2471                     self.next_id += 1;
2472                     Ok(cur_id)
2473                 }
2474                 res => bail!("Bad response: {:?}", res),
2475             }
2476         }
2477     }
2478 
2479     let mut mapper = PciMmioMapperTube {
2480         supports_readonly_mapping,
2481         vm_control_tube,
2482         registered_regions: BTreeMap::new(),
2483         next_id: 0,
2484     };
2485 
2486     loop {
2487         match hp_device_tube.recv() {
2488             Ok(cmd) => match cmd {
2489                 PciRootCommand::Add(addr, device) => {
2490                     if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
2491                         error!("failed to add hotplugged device to PCI root port: {}", e);
2492                     }
2493                 }
2494                 PciRootCommand::AddBridge(pci_bus) => {
2495                     if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
2496                         error!("failed to add hotplugged bridge to PCI root port: {}", e);
2497                     }
2498                 }
2499                 PciRootCommand::Remove(addr) => {
2500                     pci_root.lock().remove_device(addr);
2501                 }
2502                 PciRootCommand::Kill => break,
2503             },
2504             Err(e) => {
2505                 error!("Error: pci root worker channel closed: {}", e);
2506                 break;
2507             }
2508         }
2509     }
2510 }
2511 
2512 #[cfg(target_arch = "x86_64")]
get_hp_bus<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, host_addr: PciAddress, ) -> Result<Arc<Mutex<dyn HotPlugBus>>>2513 fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2514     linux: &RunnableLinuxVm<V, Vcpu>,
2515     host_addr: PciAddress,
2516 ) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2517     for (_, hp_bus) in linux.hotplug_bus.iter() {
2518         if hp_bus.lock().is_match(host_addr).is_some() {
2519             return Ok(hp_bus.clone());
2520         }
2521     }
2522     Err(anyhow!("Failed to find a suitable hotplug bus"))
2523 }
2524 
/// Hot-plugs a device into the running VM.
///
/// The device is either an emulated PCIe upstream/downstream port backed by a
/// host port, or a VFIO endpoint. The host PCI address is parsed from
/// `device.path`, a matching hotplug bus is located, the device is registered
/// with the architecture, and finally announced on the hotplug bus (with a
/// hot-plug interrupt when `device.hp_interrupt` is set).
#[cfg(target_arch = "x86_64")]
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    cfg: &Config,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hp_control_tube: &mpsc::Sender<PciRootCommand>,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
    #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
    vfio_container_manager: &mut VfioContainerManager,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)
        .context("failed to parse hotplug device's PCI address")?;
    let hp_bus = get_hp_bus(linux, host_addr)?;

    let (hotplug_key, pci_address) = match device.device_type {
        // PCIe ports: create control/MSI tubes, mirror the host port, and
        // register an emulated PCI bridge for it.
        HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(TaggedControlTube::Vm(vm_host_tube).into());
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
            let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
            let (hotplug_key, pci_bridge) = match device.device_type {
                HotPlugDeviceType::UpstreamPort => {
                    let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
                    let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
                        pcie_host, true,
                    )?));
                    let pci_bridge =
                        Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
                    // The new port becomes a hotplug bus for devices behind it.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
                    (hotplug_key, pci_bridge)
                }
                HotPlugDeviceType::DownstreamPort => {
                    let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
                    let pcie_downstream_port = Arc::new(Mutex::new(
                        PcieDownstreamPort::new_from_host(pcie_host, true)?,
                    ));
                    let pci_bridge = Box::new(PciBridge::new(
                        pcie_downstream_port.clone(),
                        msi_device_tube,
                    ));
                    // The new port becomes a hotplug bus for devices behind it.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
                    (hotplug_key, pci_bridge)
                }
                // Unreachable: the outer match already restricted the type to
                // UpstreamPort | DownstreamPort.
                _ => {
                    bail!("Impossible to reach here")
                }
            };
            let pci_address = Arch::register_pci_device(
                linux,
                pci_bridge,
                None,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;

            (hotplug_key, pci_address)
        }
        // VFIO endpoint: pass the host device through, optionally behind the
        // virtio-iommu when an iommu host tube was provided.
        HotPlugDeviceType::EndPoint => {
            let hotplug_key = HotPlugKey::HostVfio { host_addr };
            let (vfio_device, jail, viommu_mapper) = create_vfio_device(
                cfg.jail_config.as_ref(),
                &linux.vm,
                sys_allocator,
                add_control_tube,
                &device.path,
                true,
                None,
                None,
                None,
                if iommu_host_tube.is_some() {
                    IommuDevType::VirtioIommu
                } else {
                    IommuDevType::NoIommu
                },
                None,
                vfio_container_manager,
            )?;
            let vfio_pci_device = match vfio_device {
                VfioDeviceVariant::Pci(pci) => Box::new(pci),
                VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
            };
            let pci_address = Arch::register_pci_device(
                linux,
                vfio_pci_device,
                jail,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
            // Attach the endpoint to the virtio-iommu so its DMA is translated.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let endpoint_addr = pci_address.to_u32();
                let vfio_wrapper = viommu_mapper.context("expected mapper")?;
                let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
                        endpoint_addr,
                        wrapper_id: vfio_wrapper.id(),
                        container: {
                            // SAFETY:
                            // Safe because the descriptor is uniquely owned by `descriptor`.
                            unsafe { File::from_raw_descriptor(descriptor) }
                        },
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }

            (hotplug_key, pci_address)
        }
    };
    // Announce the new device on its hotplug bus; only raise the hot-plug
    // interrupt if the caller asked for it.
    hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
    if device.hp_interrupt {
        hp_bus.lock().hot_plug(pci_address)?;
    }
    Ok(())
}
2655 
/// Hot-plugs a virtio-net device described by `net_param`.
///
/// Creates the MSI, ioevent, and VM control tube pairs the device needs,
/// registers the host ends with the control loop, and hands the device ends to
/// the hotplug manager. Returns the PCI bus number the device was placed on.
#[cfg(feature = "pci-hotplug")]
fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    net_param: NetParameters,
) -> Result<u8> {
    // MSI/MSI-X interrupt tube.
    let (msi_host_tube, msi_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
    // ioevent registration tube; the device side is wrapped in a VmMemoryClient.
    let (ioevent_host_tube, ioevent_device_tube) = Tube::pair().context("create tube")?;
    let ioevent_vm_memory_client = VmMemoryClient::new(ioevent_device_tube);
    add_control_tube(
        VmMemoryTube {
            tube: ioevent_host_tube,
            expose_with_viommu: false,
        }
        .into(),
    );
    // General VM control tube.
    let (vm_control_host_tube, vm_control_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(TaggedControlTube::Vm(vm_control_host_tube).into());
    let carrier = NetResourceCarrier::new(
        net_param,
        msi_device_tube,
        ioevent_vm_memory_client,
        vm_control_device_tube,
    );
    hotplug_manager.hotplug_device(
        vec![ResourceCarrier::VirtioNet(carrier)],
        linux,
        sys_allocator,
    )
}
2689 
/// Dispatches a network hotplug control command to the add or remove handler.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
    net_cmd: NetControlCommand,
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
) -> VmResponse {
    match net_cmd {
        NetControlCommand::AddTap(ref tap_name) => handle_hotplug_net_add(
            linux,
            sys_allocator,
            add_control_tube,
            hotplug_manager,
            tap_name,
        ),
        NetControlCommand::RemoveTap(bus) => {
            handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus)
        }
    }
}
2711 
/// Handles an AddTap request: attaches the existing host tap interface named
/// `tap_name` to the guest as a hot-plugged virtio-net device.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    tap_name: &str,
) -> VmResponse {
    // Minimal net configuration that borrows the already-created tap.
    let net_param = NetParameters {
        mode: NetParametersMode::TapName {
            tap_name: tap_name.to_owned(),
            mac: None,
        },
        vhost_net: None,
        vq_pairs: None,
        packed_queue: false,
        pci_address: None,
    };
    match add_hotplug_net(
        linux,
        sys_allocator,
        add_control_tube,
        hotplug_manager,
        net_param,
    ) {
        Ok(pci_bus) => VmResponse::PciHotPlugResponse { bus: pci_bus },
        Err(e) => VmResponse::ErrString(format!("{:?}", e)),
    }
}
2744 
/// Handles a RemoveTap request: hot-unplugs the virtio-net device on `bus`.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    hotplug_manager: &mut PciHotPlugManager,
    bus: u8,
) -> VmResponse {
    if let Err(e) = hotplug_manager.remove_hotplug_device(bus, linux, sys_allocator) {
        return VmResponse::ErrString(format!("{:?}", e));
    }
    VmResponse::Ok
}
2757 
2758 #[cfg(target_arch = "x86_64")]
remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, buses_to_remove: &mut Vec<u8>, hotplug_key: HotPlugKey, child_bus: u8, ) -> Result<()>2759 fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
2760     linux: &RunnableLinuxVm<V, Vcpu>,
2761     sys_allocator: &mut SystemAllocator,
2762     buses_to_remove: &mut Vec<u8>,
2763     hotplug_key: HotPlugKey,
2764     child_bus: u8,
2765 ) -> Result<()> {
2766     for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
2767         let mut hp_bus_lock = hp_bus.lock();
2768         if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2769             sys_allocator.release_pci(pci_addr);
2770             hp_bus_lock.hot_unplug(pci_addr)?;
2771             buses_to_remove.push(child_bus);
2772             if hp_bus_lock.is_empty() {
2773                 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2774                     remove_hotplug_bridge(
2775                         linux,
2776                         sys_allocator,
2777                         buses_to_remove,
2778                         hotplug_key,
2779                         *bus_num,
2780                     )?;
2781                 }
2782             }
2783             return Ok(());
2784         }
2785     }
2786 
2787     Err(anyhow!(
2788         "Can not find device {:?} on hotplug buses",
2789         hotplug_key
2790     ))
2791 }
2792 
/// Hot-unplugs a previously hot-plugged device (port or VFIO endpoint).
///
/// Detaches the endpoint from the virtio-iommu (when `iommu_host_tube` is
/// provided), unplugs it from its hotplug bus, and removes any emulated
/// bridges that are left empty — including empty sibling downstream ports on
/// the same host bus.
#[cfg(target_arch = "x86_64")]
fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)?;
    // Reconstruct the key the device was registered under in add_hotplug_device.
    let hotplug_key = match device.device_type {
        HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
        HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
        HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
    };

    // Find the (bus number, bus) pair currently holding this device.
    let hp_bus = linux
        .hotplug_bus
        .iter()
        .find(|(_, hp_bus)| {
            let hp_bus = hp_bus.lock();
            hp_bus.get_hotplug_device(hotplug_key).is_some()
        })
        .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));

    if let Some((bus_num, hp_bus)) = hp_bus {
        let mut buses_to_remove = Vec::new();
        let mut removed_key = None;
        let mut hp_bus_lock = hp_bus.lock();
        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
            // Detach the endpoint from the virtio-iommu before unplugging.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
                        endpoint_addr: pci_addr.to_u32(),
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }
            // Check whether every sibling downstream port on the same host bus
            // is empty. (NB: "simbling" in identifiers is a historical typo for
            // "sibling".)
            let mut empty_simbling = true;
            if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
                hp_bus_lock.get_hotplug_key()
            {
                let addr_alias = host_addr;
                for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                    if *simbling_bus_num != bus_num {
                        let hp_bus_lock = hp_bus.lock();
                        let hotplug_key = hp_bus_lock.get_hotplug_key();
                        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                            if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
                                empty_simbling = false;
                                break;
                            }
                        }
                    }
                }
            }

            // If all sibling downstream ports are empty, do not send a hot-unplug
            // event for this downstream port. The root port will send one plug-out
            // interrupt and remove all the remaining devices.
            if !empty_simbling {
                hp_bus_lock.hot_unplug(pci_addr)?;
            }

            sys_allocator.release_pci(pci_addr);
            // Remove this port's own bridge when it (and, in the all-siblings-empty
            // case, the whole group) no longer hosts any device.
            if empty_simbling || hp_bus_lock.is_empty() {
                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
                    removed_key = Some(hotplug_key);
                    remove_hotplug_bridge(
                        linux,
                        sys_allocator,
                        &mut buses_to_remove,
                        hotplug_key,
                        bus_num,
                    )?;
                }
            }
        }

        // Some types of TBT devices have a few empty downstream ports. The emulated
        // bridges of these ports won't be removed above since no vfio device is
        // connected to our emulated bridges. So we explicitly check all sibling
        // bridges of the removed bridge here, and remove them if the bridge has no
        // child device connected.
        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
            let addr_alias = host_addr;
            for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                if *simbling_bus_num != bus_num {
                    let hp_bus_lock = hp_bus.lock();
                    let hotplug_key = hp_bus_lock.get_hotplug_key();
                    if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                        if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
                            remove_hotplug_bridge(
                                linux,
                                sys_allocator,
                                &mut buses_to_remove,
                                hotplug_key.unwrap(),
                                *simbling_bus_num,
                            )?;
                        }
                    }
                }
            }
        }
        // Drop the bookkeeping entries for every bus torn down above.
        for bus in buses_to_remove.iter() {
            linux.hotplug_bus.remove(bus);
        }
        return Ok(());
    }

    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        hotplug_key
    ))
}
2909 
trigger_vm_suspend_and_wait_for_entry( guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>, tube: &SendTube, response: vm_control::VmResponse, suspend_tube: Arc<Mutex<SendTube>>, pm: Option<Arc<Mutex<dyn PmResource + Send>>>, )2910 pub fn trigger_vm_suspend_and_wait_for_entry(
2911     guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
2912     tube: &SendTube,
2913     response: vm_control::VmResponse,
2914     suspend_tube: Arc<Mutex<SendTube>>,
2915     pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
2916 ) {
2917     let (lock, cvar) = &*guest_suspended_cvar;
2918     let mut guest_suspended = lock.lock();
2919 
2920     *guest_suspended = false;
2921 
2922     // During suspend also emulate sleepbtn, which allows to suspend VM (if running e.g. acpid and
2923     // reacts on sleep button events)
2924     if let Some(pm) = pm {
2925         pm.lock().slpbtn_evt();
2926     } else {
2927         error!("generating sleepbtn during suspend not supported");
2928     }
2929 
2930     // Wait for notification about guest suspension, if not received after 15sec,
2931     // proceed anyway.
2932     let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
2933     guest_suspended = result.0;
2934 
2935     if result.1.timed_out() {
2936         warn!("Guest suspension timeout - proceeding anyway");
2937     } else if *guest_suspended {
2938         info!("Guest suspended");
2939     }
2940 
2941     if let Err(e) = suspend_tube.lock().send(&true) {
2942         error!("failed to trigger suspend event: {}", e);
2943     }
2944     // Now we ready to send response over the tube and communicate that VM suspend has finished
2945     if let Err(e) = tube.send(&response) {
2946         error!("failed to send VmResponse: {}", e);
2947     }
2948 }
2949 
#[cfg(feature = "pvclock")]
#[derive(Debug)]
/// The action requested by the pvclock device to perform on the main thread.
enum PvClockAction {
    #[cfg(target_arch = "aarch64")]
    /// Update the counter offset with VmAarch64::set_counter_offset.
    ///
    /// The payload is the total number of suspended ticks reported by the
    /// pvclock device in `PvClockCommandResponse::Resumed`.
    SetCounterOffset(u64),
}
2958 
/// Sends `command` to the pvclock device and waits for its response.
///
/// Returns an action for the caller to perform on the main thread, if the
/// response calls for one.
#[cfg(feature = "pvclock")]
fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<Option<PvClockAction>> {
    tube.send(&command)
        .with_context(|| format!("failed to send pvclock command {:?}", command))?;
    let resp = tube
        .recv::<PvClockCommandResponse>()
        .context("failed to receive pvclock command response")?;
    match resp {
        PvClockCommandResponse::Err(e) => {
            bail!("pvclock encountered error on {:?}: {}", command, e)
        }
        PvClockCommandResponse::DeviceInactive => {
            warn!("Tried to send {command:?} but pvclock device was inactive");
            Ok(None)
        }
        PvClockCommandResponse::Ok => {
            info!("{command:?} completed with {resp:?}");
            Ok(None)
        }
        PvClockCommandResponse::Resumed {
            total_suspended_ticks,
        } => {
            info!("{command:?} completed with {total_suspended_ticks} total_suspended_ticks");
            cfg_if::cfg_if! {
                if #[cfg(target_arch = "aarch64")] {
                    Ok(Some(PvClockAction::SetCounterOffset(total_suspended_ticks)))
                } else {
                    // For non-AArch64 platforms this is handled by directly updating the offset in
                    // shared memory in the pvclock device worker.
                    Ok(None)
                }
            }
        }
    }
}
2994 
2995 #[cfg(target_arch = "x86_64")]
handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, cfg: &Config, add_control_tube: &mut impl FnMut(AnyControlTube), hp_control_tube: &mpsc::Sender<PciRootCommand>, iommu_host_tube: Option<&Tube>, device: &HotPlugDeviceInfo, add: bool, #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>, vfio_container_manager: &mut VfioContainerManager, ) -> VmResponse2996 fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
2997     linux: &mut RunnableLinuxVm<V, Vcpu>,
2998     sys_allocator: &mut SystemAllocator,
2999     cfg: &Config,
3000     add_control_tube: &mut impl FnMut(AnyControlTube),
3001     hp_control_tube: &mpsc::Sender<PciRootCommand>,
3002     iommu_host_tube: Option<&Tube>,
3003     device: &HotPlugDeviceInfo,
3004     add: bool,
3005     #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
3006     vfio_container_manager: &mut VfioContainerManager,
3007 ) -> VmResponse {
3008     let iommu_host_tube = if cfg.vfio_isolate_hotplug {
3009         iommu_host_tube
3010     } else {
3011         None
3012     };
3013 
3014     let ret = if add {
3015         add_hotplug_device(
3016             linux,
3017             sys_allocator,
3018             cfg,
3019             add_control_tube,
3020             hp_control_tube,
3021             iommu_host_tube,
3022             device,
3023             #[cfg(feature = "swap")]
3024             swap_controller,
3025             vfio_container_manager,
3026         )
3027     } else {
3028         remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
3029     };
3030 
3031     match ret {
3032         Ok(()) => VmResponse::Ok,
3033         Err(e) => {
3034             error!("handle_hotplug_command failure: {}", e);
3035             VmResponse::Err(base::Error::new(libc::EINVAL))
3036         }
3037     }
3038 }
3039 
/// Borrowed state threaded through the VM control loop and into
/// `process_vm_request` / `process_vm_control_event`.
///
/// Groups the VM handle, config, allocator, and the various control tubes so
/// request handlers take a single parameter instead of a long argument list.
/// Most fields are borrows owned by `run_control`.
struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
    linux: &'a mut RunnableLinuxVm<V, Vcpu>,
    cfg: &'a Config,
    sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
    // Keyed by the wait-context id used for `Token::VmControl { id }`.
    control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
    disk_host_tubes: &'a [Tube],
    #[cfg(feature = "audio")]
    snd_host_tubes: &'a [Tube],
    #[cfg(feature = "gpu")]
    gpu_control_tube: Option<&'a Tube>,
    #[cfg(feature = "usb")]
    usb_control_tube: &'a Tube,
    #[cfg(target_arch = "x86_64")]
    iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
    #[cfg(target_arch = "x86_64")]
    hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
    // Used by the s2idle_wait thread to learn when the guest actually suspended.
    guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
    #[cfg(feature = "pci-hotplug")]
    hotplug_manager: &'a mut Option<PciHotPlugManager>,
    #[cfg(feature = "swap")]
    swap_controller: &'a mut Option<SwapController>,
    // One (join handle, control channel) pair per vCPU thread.
    vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
    #[cfg(feature = "balloon")]
    balloon_tube: Option<&'a mut BalloonTube>,
    device_ctrl_tube: &'a Tube,
    irq_handler_control: &'a Tube,
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    vm_memory_handler_control: &'a Tube,
    #[cfg(feature = "registered_events")]
    registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    #[cfg(feature = "pvclock")]
    pvclock_host_tube: Option<Arc<Tube>>,
    vfio_container_manager: &'a mut VfioContainerManager,
    suspended_pvclock_state: &'a mut Option<hypervisor::ClockState>,
    // vCPU index -> (pid, tid), served back for `VmRequest::VcpuPidTid`.
    vcpus_pid_tid: &'a BTreeMap<usize, (u32, u32)>,
}
3076 
/// Outcome of servicing a single `VmRequest` in the control loop.
struct VmRequestResult {
    // Response to send back to the requester; `None` when no reply should be
    // sent by the caller (e.g. it is delivered elsewhere, such as by the
    // s2idle_wait thread or via another control tube).
    response: Option<VmResponse>,
    // `true` when the control loop should terminate (set for `VmRequest::Exit`).
    exit: bool,
}
3081 
3082 impl VmRequestResult {
new(response: Option<VmResponse>, exit: bool) -> Self3083     fn new(response: Option<VmResponse>, exit: bool) -> Self {
3084         VmRequestResult { response, exit }
3085     }
3086 }
3087 
/// Services a single `VmRequest` received on control tube `id`.
///
/// Requests with dedicated handling (exit, hotplug, registered events,
/// balloon, throttle, ...) are dispatched here; everything else falls through
/// to `VmRequest::execute`, with pvclock bookkeeping wrapped around vCPU
/// suspend/resume and special-cased s2idle suspend handling.
///
/// `add_tubes` collects new `TaggedControlTube`s created by hotplug so the
/// caller can register them with the wait context. Returns a
/// `VmRequestResult` carrying the optional response and the exit flag.
fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    state: &mut ControlLoopState<V, Vcpu>,
    id: usize,
    tube: &Tube,
    request: VmRequest,
    #[cfg_attr(
        not(any(target_arch = "x86_64", feature = "pci-hotplug")),
        allow(unused_variables, clippy::ptr_arg)
    )]
    add_tubes: &mut Vec<TaggedControlTube>,
) -> Result<VmRequestResult> {
    // Tubes created during hotplug are accumulated here and handed to the irq
    // and vm-memory handler threads after the request completes (see below).
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_irq_control_tubes = Vec::new();
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_vm_memory_control_tubes = Vec::new();

    // Sorts each freshly created control tube into the matching bucket.
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_control_tube = |t| match t {
        AnyControlTube::DeviceControlTube(_) => {
            panic!("hotplugging DeviceControlTube not supported yet")
        }
        AnyControlTube::IrqTube(t) => add_irq_control_tubes.push(t),
        AnyControlTube::TaggedControlTube(t) => add_tubes.push(t),
        AnyControlTube::VmMemoryTube(t) => add_vm_memory_control_tubes.push(t),
    };

    let response = match request {
        VmRequest::Exit => {
            // Tell the caller to break out of the control loop.
            return Ok(VmRequestResult::new(Some(VmResponse::Ok), true));
        }
        VmRequest::HotPlugVfioCommand { device, add } => {
            // VFIO hotplug is only implemented for x86_64.
            #[cfg(target_arch = "x86_64")]
            {
                handle_hotplug_command(
                    state.linux,
                    &mut state.sys_allocator.lock(),
                    state.cfg,
                    &mut add_control_tube,
                    state.hp_control_tube,
                    state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
                    &device,
                    add,
                    #[cfg(feature = "swap")]
                    state.swap_controller,
                    state.vfio_container_manager,
                )
            }

            #[cfg(not(target_arch = "x86_64"))]
            {
                // Suppress warnings.
                let _ = (device, add);
                let _ = &state.vfio_container_manager;
                VmResponse::Ok
            }
        }
        #[cfg(feature = "pci-hotplug")]
        VmRequest::HotPlugNetCommand(net_cmd) => {
            if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
                handle_hotplug_net_command(
                    net_cmd,
                    state.linux,
                    &mut state.sys_allocator.lock(),
                    &mut add_control_tube,
                    hotplug_manager,
                )
            } else {
                VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
            }
        }
        #[cfg(feature = "registered_events")]
        VmRequest::RegisterListener { socket_addr, event } => {
            // Reuse an existing tube to the same address if one is registered
            // for a different event; skip entirely if already registered for
            // this event.
            let (registered_tube, already_registered) =
                find_registered_tube(state.registered_evt_tubes, &socket_addr, event);

            if !already_registered {
                let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;

                if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
                    tubes.insert(addr_tube);
                } else {
                    state
                        .registered_evt_tubes
                        .insert(event, vec![addr_tube].into_iter().collect());
                }
            }
            VmResponse::Ok
        }
        #[cfg(feature = "registered_events")]
        VmRequest::UnregisterListener { socket_addr, event } => {
            // Drop this address from the one event, then prune empty entries.
            if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
                tubes.retain(|t| t.socket_addr != socket_addr);
            }
            state
                .registered_evt_tubes
                .retain(|_, tubes| !tubes.is_empty());
            VmResponse::Ok
        }
        #[cfg(feature = "registered_events")]
        VmRequest::Unregister { socket_addr } => {
            // Drop this address from every event, then prune empty entries.
            for (_, tubes) in state.registered_evt_tubes.iter_mut() {
                tubes.retain(|t| t.socket_addr != socket_addr);
            }
            state
                .registered_evt_tubes
                .retain(|_, tubes| !tubes.is_empty());
            VmResponse::Ok
        }
        #[cfg(feature = "balloon")]
        VmRequest::BalloonCommand(cmd) => {
            if let Some(tube) = state.balloon_tube.as_mut() {
                // `send_cmd` may yield a response destined for a different
                // control tube (`key != id`); deliver it there directly and
                // send nothing on our own tube.
                let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
                    return Ok(VmRequestResult::new(None, false));
                };
                if key != id {
                    let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
                        return Ok(VmRequestResult::new(None, false));
                    };
                    if let Err(e) = tube.send(&r) {
                        error!("failed to send VmResponse: {}", e);
                    }
                    return Ok(VmRequestResult::new(None, false));
                }
                r
            } else {
                VmResponse::Err(base::Error::new(libc::ENOTSUP))
            }
        }
        VmRequest::VcpuPidTid => VmResponse::VcpuPidTidResponse {
            pid_tid_map: state.vcpus_pid_tid.clone(),
        },
        VmRequest::Throttle(vcpu, cycles) => {
            // Fire-and-forget: no response is sent for throttle requests.
            vcpu::kick_vcpu(
                &state.vcpu_handles.get(vcpu),
                state.linux.irq_chip.as_irq_chip(),
                VcpuControl::Throttle(cycles),
            );
            return Ok(VmRequestResult::new(None, false));
        }
        _ => {
            // All remaining request kinds go through `VmRequest::execute`,
            // bracketed by pvclock resume (before) / suspend (after) handling.
            if !state.cfg.force_s2idle {
                #[cfg(feature = "pvclock")]
                if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
                    // Update clock offset when pvclock is used.
                    if let VmRequest::ResumeVcpus = request {
                        let cmd = PvClockCommand::Resume;
                        match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
                            Ok(action) => {
                                info!("{:?} command successfully processed", cmd);
                                if let Some(action) = action {
                                    match action {
                                        #[cfg(target_arch = "aarch64")]
                                        PvClockAction::SetCounterOffset(offset) => {
                                            state.linux.vm.set_counter_offset(offset)?;
                                        }
                                    }
                                }
                            }
                            Err(e) => error!("{:?} command failed: {:#}", cmd, e),
                        };
                    }
                }
            }
            // Kick helper passed into `execute`; notifies resume-aware devices
            // before the vCPUs transition back to Running.
            let kick_all_vcpus = |msg| {
                if let VcpuControl::RunState(VmRunMode::Running) = msg {
                    for dev in &state.linux.resume_notify_devices {
                        dev.lock().resume_imminent();
                    }
                }
                vcpu::kick_all_vcpus(state.vcpu_handles, state.linux.irq_chip.as_irq_chip(), msg);
            };
            let response = request.execute(
                &state.linux.vm,
                state.disk_host_tubes,
                #[cfg(feature = "audio")]
                state.snd_host_tubes,
                #[cfg(not(feature = "audio"))]
                &[],
                &mut state.linux.pm,
                #[cfg(feature = "gpu")]
                state.gpu_control_tube,
                #[cfg(not(feature = "gpu"))]
                None,
                #[cfg(feature = "usb")]
                Some(state.usb_control_tube),
                #[cfg(not(feature = "usb"))]
                None,
                &mut state.linux.bat_control,
                kick_all_vcpus,
                |index, msg| {
                    vcpu::kick_vcpu(
                        &state.vcpu_handles.get(index),
                        state.linux.irq_chip.as_irq_chip(),
                        msg,
                    )
                },
                state.cfg.force_s2idle,
                #[cfg(feature = "swap")]
                state.swap_controller.as_ref(),
                state.device_ctrl_tube,
                state.vcpu_handles.len(),
                state.irq_handler_control,
                || state.linux.irq_chip.snapshot(state.linux.vcpu_count),
                state.suspended_pvclock_state,
            );
            if state.cfg.force_s2idle {
                if let VmRequest::SuspendVcpus = request {
                    // Spawn s2idle wait thread.
                    let send_tube = tube.try_clone_send_tube().unwrap();
                    let suspend_tube = state.linux.suspend_tube.0.clone();
                    let guest_suspended_cvar = state.guest_suspended_cvar.clone();
                    let pm = state.linux.pm.clone();

                    std::thread::Builder::new()
                        .name("s2idle_wait".to_owned())
                        .spawn(move || {
                            trigger_vm_suspend_and_wait_for_entry(
                                guest_suspended_cvar.unwrap(),
                                &send_tube,
                                response,
                                suspend_tube,
                                pm,
                            )
                        })
                        .context("failed to spawn s2idle_wait thread")?;

                    // For s2idle, omit the response since it will be sent by
                    // s2idle_wait thread when suspension actually happens.
                    return Ok(VmRequestResult::new(None, false));
                }
            } else {
                #[cfg(feature = "pvclock")]
                if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
                    // Record the time after VCPUs are suspended to track suspension duration.
                    if let VmRequest::SuspendVcpus = request {
                        let cmd = PvClockCommand::Suspend;
                        match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
                            Ok(action) => {
                                info!("{:?} command successfully processed", cmd);
                                if let Some(action) = action {
                                    // Suspend never requires main-thread work.
                                    error!("Unexpected action {:?} requested for suspend", action);
                                }
                            }
                            Err(e) => error!("{:?} command failed: {:#}", cmd, e),
                        };
                    }
                }
            }
            response
        }
    };

    // Hand any tubes created during hotplug to their handler threads.
    cfg_if::cfg_if! {
        if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
            if !add_irq_control_tubes.is_empty() {
                state
                    .irq_handler_control
                    .send(&IrqHandlerRequest::AddIrqControlTubes(
                        add_irq_control_tubes,
                    ))?;
            }
            if !add_vm_memory_control_tubes.is_empty() {
                state
                    .vm_memory_handler_control
                    .send(&VmMemoryHandlerRequest::AddControlTubes(
                        add_vm_memory_control_tubes,
                    ))?;
            }
        }
    }

    Ok(VmRequestResult::new(Some(response), false))
}
3361 
process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, socket: &TaggedControlTube, ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)>3362 fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3363     state: &mut ControlLoopState<V, Vcpu>,
3364     id: usize,
3365     socket: &TaggedControlTube,
3366 ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
3367     let mut vm_control_ids_to_remove = Vec::new();
3368     let mut add_tubes = Vec::new();
3369     match socket {
3370         TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3371             Ok(request) => {
3372                 let res = process_vm_request(state, id, tube, request, &mut add_tubes)?;
3373 
3374                 if let Some(response) = res.response {
3375                     if let Err(e) = tube.send(&response) {
3376                         error!("failed to send VmResponse: {}", e);
3377                     }
3378                 }
3379 
3380                 if res.exit {
3381                     return Ok((true, Vec::new(), Vec::new()));
3382                 }
3383             }
3384             Err(e) => {
3385                 if let TubeError::Disconnected = e {
3386                     vm_control_ids_to_remove.push(id);
3387                 } else {
3388                     error!("failed to recv VmRequest: {}", e);
3389                 }
3390             }
3391         },
3392         TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMemoryMappingRequest>() {
3393             Ok(request) => {
3394                 let response = request.execute(&mut state.linux.vm);
3395                 if let Err(e) = tube.send(&response) {
3396                     error!("failed to send VmMsyncResponse: {}", e);
3397                 }
3398             }
3399             Err(e) => {
3400                 if let TubeError::Disconnected = e {
3401                     vm_control_ids_to_remove.push(id);
3402                 } else {
3403                     error!("failed to recv VmMsyncRequest: {}", e);
3404                 }
3405             }
3406         },
3407         TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3408             Ok(request) => {
3409                 let response =
3410                     request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
3411                 if let Err(e) = tube.send(&response) {
3412                     error!("failed to send VmResponse: {}", e);
3413                 }
3414             }
3415             Err(e) => {
3416                 if let TubeError::Disconnected = e {
3417                     vm_control_ids_to_remove.push(id);
3418                 } else {
3419                     error!("failed to recv VmResponse: {}", e);
3420                 }
3421             }
3422         },
3423     }
3424 
3425     Ok((false, vm_control_ids_to_remove, add_tubes))
3426 }
3427 
#[cfg(feature = "registered_events")]
/// A `ProtoTube` paired with the socket address it is connected to.
///
/// Equality and hashing are keyed on `socket_addr` alone (see the trait impls
/// below); all tubes for the same address share a single `Rc`'d `ProtoTube`.
struct AddressedProtoTube {
    tube: Rc<ProtoTube>,
    socket_addr: String,
}
3433 
#[cfg(feature = "registered_events")]
impl PartialEq for AddressedProtoTube {
    /// Two entries are equal iff they refer to the same socket address; the
    /// tube handle itself is ignored.
    fn eq(&self, other: &Self) -> bool {
        self.socket_addr.eq(&other.socket_addr)
    }
}
3440 
#[cfg(feature = "registered_events")]
// `Eq` is sound here: `PartialEq` delegates to `String` equality, which is a
// total equivalence relation.
impl Eq for AddressedProtoTube {}
3443 
#[cfg(feature = "registered_events")]
impl Hash for AddressedProtoTube {
    // Hash only the socket address, mirroring the `PartialEq` impl — required
    // for consistent behavior when stored in a `HashSet`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.socket_addr.hash(state);
    }
}
3450 
#[cfg(feature = "registered_events")]
impl AddressedProtoTube {
    /// Sends a protobuf message to the listener over the underlying tube.
    pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
        self.tube.send_proto(msg)
    }
}
3457 
#[cfg(feature = "registered_events")]
/// Scans the registered-event map for listeners at `socket_addr`.
///
/// Returns `(registered_tube, already_registered)`:
/// * `already_registered` is `true` if `socket_addr` is already listening for
///   exactly `event` (the search stops as soon as this is found, so
///   `registered_tube` may or may not be set in that case — callers only use
///   it when `already_registered` is `false`).
/// * `registered_tube` is an existing tube to `socket_addr` registered for
///   some other event, reusable instead of opening a new connection.
fn find_registered_tube<'a>(
    registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    socket_addr: &str,
    event: RegisteredEvent,
) -> (Option<&'a Rc<ProtoTube>>, bool) {
    let mut registered_tube: Option<&Rc<ProtoTube>> = None;
    let mut already_registered = false;
    'outer: for (evt, addr_tubes) in registered_tubes {
        for addr_tube in addr_tubes {
            if addr_tube.socket_addr == socket_addr {
                // Exact (event, addr) match: nothing more to learn.
                if *evt == event {
                    already_registered = true;
                    break 'outer;
                }
                // Since all tubes of the same addr should
                // be an RC to the same tube, it doesn't
                // matter which one we get. But we do need
                // to check for a registration for the
                // current event, so can't break here.
                registered_tube = Some(&addr_tube.tube);
            }
        }
    }
    (registered_tube, already_registered)
}
3484 
#[cfg(feature = "registered_events")]
/// Wraps `addr` into an `AddressedProtoTube`, reusing `tube` when a connection
/// to that address already exists, and connecting a fresh seqpacket socket
/// otherwise.
fn make_addr_tube_from_maybe_existing(
    tube: Option<&Rc<ProtoTube>>,
    addr: String,
) -> Result<AddressedProtoTube> {
    match tube {
        // Share the existing connection via a new Rc handle.
        Some(existing) => Ok(AddressedProtoTube {
            tube: existing.clone(),
            socket_addr: addr,
        }),
        // No connection yet: dial the listener and wrap the new tube.
        None => {
            let sock = UnixSeqpacket::connect(addr.clone()).with_context(|| {
                format!("failed to connect to registered listening socket {}", addr)
            })?;
            Ok(AddressedProtoTube {
                tube: Rc::new(ProtoTube::from(Tube::try_from(sock)?)),
                socket_addr: addr,
            })
        }
    }
}
3506 
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, cfg: Config, control_server_socket: Option<UnlinkUnixSeqpacketListener>, all_control_tubes: Vec<AnyControlTube>, #[cfg(feature = "usb")] usb_control_tube: Tube, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, sigchld_fd: SignalFd, gralloc: RutabagaGralloc, vcpu_ids: Vec<usize>, iommu_host_tube: Option<Tube>, #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>, #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>, #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>, #[allow(unused_mut)] #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube, guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, metrics_tube: RecvTube, mut vfio_container_manager: VfioContainerManager, mut worker_process_pids: BTreeSet<Pid>, #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap< usize, PathBuf, >, ) -> Result<ExitState>3507 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3508     mut linux: RunnableLinuxVm<V, Vcpu>,
3509     sys_allocator: SystemAllocator,
3510     cfg: Config,
3511     control_server_socket: Option<UnlinkUnixSeqpacketListener>,
3512     all_control_tubes: Vec<AnyControlTube>,
3513     #[cfg(feature = "usb")] usb_control_tube: Tube,
3514     vm_evt_rdtube: RecvTube,
3515     vm_evt_wrtube: SendTube,
3516     sigchld_fd: SignalFd,
3517     gralloc: RutabagaGralloc,
3518     vcpu_ids: Vec<usize>,
3519     iommu_host_tube: Option<Tube>,
3520     #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
3521     #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
3522     #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
3523     #[allow(unused_mut)] // mut is required x86 only
3524     #[cfg(feature = "swap")]
3525     mut swap_controller: Option<SwapController>,
3526     #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
3527     guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
3528     metrics_tube: RecvTube,
3529     mut vfio_container_manager: VfioContainerManager,
3530     // A set of PID of child processes whose clean exit is expected and can be ignored.
3531     mut worker_process_pids: BTreeSet<Pid>,
3532     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap<
3533         usize,
3534         PathBuf,
3535     >,
3536 ) -> Result<ExitState> {
3537     // Split up `all_control_tubes`.
3538     #[cfg(feature = "balloon")]
3539     let mut balloon_host_tube = None;
3540     let mut disk_host_tubes = Vec::new();
3541     #[cfg(feature = "gpu")]
3542     let mut gpu_control_tube = None;
3543     #[cfg(feature = "pvclock")]
3544     let mut pvclock_host_tube = None;
3545     #[cfg(feature = "audio")]
3546     let mut snd_host_tubes = Vec::new();
3547     let mut irq_control_tubes = Vec::new();
3548     let mut vm_memory_control_tubes = Vec::new();
3549     let mut control_tubes = Vec::new();
3550     for t in all_control_tubes {
3551         match t {
3552             #[cfg(feature = "balloon")]
3553             AnyControlTube::DeviceControlTube(DeviceControlTube::Balloon(t)) => {
3554                 assert!(balloon_host_tube.is_none());
3555                 balloon_host_tube = Some(t)
3556             }
3557             AnyControlTube::DeviceControlTube(DeviceControlTube::Disk(t)) => {
3558                 disk_host_tubes.push(t)
3559             }
3560             #[cfg(feature = "gpu")]
3561             AnyControlTube::DeviceControlTube(DeviceControlTube::Gpu(t)) => {
3562                 assert!(gpu_control_tube.is_none());
3563                 gpu_control_tube = Some(t)
3564             }
3565             #[cfg(feature = "pvclock")]
3566             AnyControlTube::DeviceControlTube(DeviceControlTube::PvClock(t)) => {
3567                 assert!(pvclock_host_tube.is_none());
3568                 pvclock_host_tube = Some(Arc::new(t))
3569             }
3570             #[cfg(feature = "audio")]
3571             AnyControlTube::DeviceControlTube(DeviceControlTube::Snd(t)) => {
3572                 snd_host_tubes.push(t);
3573             }
3574             AnyControlTube::IrqTube(t) => irq_control_tubes.push(t),
3575             AnyControlTube::TaggedControlTube(t) => control_tubes.push(t),
3576             AnyControlTube::VmMemoryTube(t) => vm_memory_control_tubes.push(t),
3577         }
3578     }
3579 
3580     #[cfg(feature = "gdb")]
3581     let (to_gdb_channel, gdb) = if let Some(port) = cfg.gdb {
3582         // GDB needs a control socket to interrupt vcpus.
3583         let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
3584         control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
3585         // Create a channel for GDB thread.
3586         let (to_gdb_channel, from_vcpu_channel) = mpsc::channel();
3587         (
3588             Some(to_gdb_channel),
3589             Some((port, gdb_control_tube, from_vcpu_channel)),
3590         )
3591     } else {
3592         (None, None)
3593     };
3594 
3595     #[derive(EventToken)]
3596     enum Token {
3597         VmEvent,
3598         Suspend,
3599         ChildSignal,
3600         VmControlServer,
3601         VmControl {
3602             id: usize,
3603         },
3604         #[cfg(feature = "registered_events")]
3605         RegisteredEvent,
3606         #[cfg(feature = "balloon")]
3607         BalloonTube,
3608     }
3609     stdin()
3610         .set_raw_mode()
3611         .expect("failed to set terminal raw mode");
3612 
3613     let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
3614     let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
3615 
3616     let wait_ctx = WaitContext::build_with(&[
3617         (&linux.suspend_tube.1, Token::Suspend),
3618         (&sigchld_fd, Token::ChildSignal),
3619         (&vm_evt_rdtube, Token::VmEvent),
3620         #[cfg(feature = "registered_events")]
3621         (&reg_evt_rdtube, Token::RegisteredEvent),
3622     ])
3623     .context("failed to build wait context")?;
3624 
3625     if let Some(socket_server) = &control_server_socket {
3626         wait_ctx
3627             .add(socket_server, Token::VmControlServer)
3628             .context("failed to add descriptor to wait context")?;
3629     }
3630     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
3631     let mut next_control_id = control_tubes.len();
3632     for (id, socket) in control_tubes.iter() {
3633         wait_ctx
3634             .add(socket.as_ref(), Token::VmControl { id: *id })
3635             .context("failed to add descriptor to wait context")?;
3636     }
3637 
3638     #[cfg(feature = "balloon")]
3639     let mut balloon_tube = balloon_host_tube
3640         .map(|tube| -> Result<BalloonTube> {
3641             wait_ctx
3642                 .add(&tube, Token::BalloonTube)
3643                 .context("failed to add descriptor to wait context")?;
3644             Ok(BalloonTube::new(tube))
3645         })
3646         .transpose()
3647         .context("failed to create balloon tube")?;
3648 
3649     if cfg.jail_config.is_some() {
3650         // Before starting VCPUs, in case we started with some capabilities, drop them all.
3651         drop_capabilities().context("failed to drop process capabilities")?;
3652     }
3653 
3654     let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
3655     // Create devices thread, and restore if a restore file exists.
3656     linux.devices_thread = match create_devices_worker_thread(
3657         linux.vm.get_memory().clone(),
3658         linux.io_bus.clone(),
3659         linux.mmio_bus.clone(),
3660         device_ctrl_resp,
3661     ) {
3662         Ok(join_handle) => Some(join_handle),
3663         Err(e) => {
3664             return Err(anyhow!("Failed to start devices thread: {}", e));
3665         }
3666     };
3667 
3668     let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
3669     let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
3670 
3671     if !linux
3672         .vm
3673         .get_hypervisor()
3674         .check_capability(HypervisorCap::ImmediateExit)
3675     {
3676         return Err(anyhow!(
3677             "missing required hypervisor capability ImmediateExit"
3678         ));
3679     }
3680 
3681     vcpu::setup_vcpu_signal_handler()?;
3682 
3683     let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
3684         Some(vec) => vec.into_iter().map(Some).collect(),
3685         None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
3686     };
3687     // Enable core scheduling before creating vCPUs so that the cookie will be
3688     // shared by all vCPU threads.
3689     // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
3690     // itself for even better performance. Only vCPUs need the feature.
3691     if cfg.core_scheduling && cfg.per_vm_core_scheduling {
3692         if let Err(e) = enable_core_scheduling() {
3693             error!("Failed to enable core scheduling: {}", e);
3694         }
3695     }
3696 
    // The tasks file only exists on sysfs if CgroupV1 hierarchies are enabled
3698     let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
3699         None => None,
3700         Some(cgroup_path) => {
3701             // Move main process to cgroup_path
3702             match File::create(cgroup_path.join("tasks")) {
3703                 Ok(file) => Some(file),
3704                 Err(_) => {
3705                     info!(
3706                         "Unable to open tasks file in cgroup: {}, trying CgroupV2",
3707                         cgroup_path.display()
3708                     );
3709                     None
3710                 }
3711             }
3712         }
3713     };
3714 
3715     // vCPU freq domains are currently only supported with CgroupsV2.
3716     let mut vcpu_cgroup_v2_files: std::collections::BTreeMap<usize, File> = BTreeMap::new();
3717     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3718     for (vcpu_id, vcpu_domain_path) in vcpu_domain_paths.iter() {
3719         let vcpu_cgroup_v2_file = File::create(vcpu_domain_path.join("cgroup.threads"))
3720             .with_context(|| {
3721                 format!(
3722                     "failed to create vcpu-cgroup-path {}",
3723                     vcpu_domain_path.join("cgroup.threads").display(),
3724                 )
3725             })?;
3726         vcpu_cgroup_v2_files.insert(*vcpu_id, vcpu_cgroup_v2_file);
3727     }
3728 
3729     #[cfg(target_arch = "x86_64")]
3730     let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
3731     #[cfg(target_arch = "x86_64")]
3732     if cfg.bus_lock_ratelimit > 0 {
3733         let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
3734         if linux.vm.check_capability(VmCap::BusLockDetect) {
3735             info!("Hypervisor support bus lock detect");
3736             linux
3737                 .vm
3738                 .enable_capability(VmCap::BusLockDetect, 0)
3739                 .expect("kvm: Failed to enable bus lock detection cap");
3740             info!("Hypervisor enabled bus lock detect");
3741             bus_lock_ratelimit_ctrl
3742                 .lock()
3743                 .ratelimit_set_speed(bus_lock_ratelimit);
3744         } else {
3745             bail!("Kvm: bus lock detection unsuported");
3746         }
3747     }
3748 
3749     #[cfg(target_os = "android")]
3750     android::set_process_profiles(&cfg.task_profiles)?;
3751 
3752     #[allow(unused_mut)]
3753     let mut run_mode = if cfg.suspended {
3754         // Sleep devices before creating vcpus.
3755         device_ctrl_tube
3756             .send(&DeviceControlCommand::SleepDevices)
3757             .context("send command to devices control socket")?;
3758         match device_ctrl_tube
3759             .recv()
3760             .context("receive from devices control socket")?
3761         {
3762             VmResponse::Ok => (),
3763             resp => bail!("device sleep failed: {}", resp),
3764         }
3765         VmRunMode::Suspending
3766     } else {
3767         VmRunMode::Running
3768     };
3769     #[cfg(feature = "gdb")]
3770     if to_gdb_channel.is_some() {
3771         // Wait until a GDB client attaches
3772         run_mode = VmRunMode::Breakpoint;
3773     }
3774     // If we are restoring from a snapshot, then start suspended.
3775     let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
3776         (VmRunMode::Suspending, run_mode)
3777     } else {
3778         (run_mode, run_mode)
3779     };
3780 
3781     // Architecture-specific code must supply a vcpu_init element for each VCPU.
3782     assert_eq!(vcpus.len(), linux.vcpu_init.len());
3783 
3784     let (vcpu_pid_tid_sender, vcpu_pid_tid_receiver) = mpsc::channel();
3785     for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
3786     {
3787         let vcpu_cgroup_file: Option<File>;
3788         if let Some(cgroup_file) = &vcpu_cgroup_tasks_file {
3789             vcpu_cgroup_file = Some(cgroup_file.try_clone().unwrap())
3790         } else if !cfg.cpu_freq_domains.is_empty() {
3791             vcpu_cgroup_file = Some(
3792                 (vcpu_cgroup_v2_files.remove(&cpu_id).unwrap())
3793                     .try_clone()
3794                     .unwrap(),
3795             )
3796         } else {
3797             vcpu_cgroup_file = None
3798         };
3799 
3800         let (to_vcpu_channel, from_main_channel) = mpsc::channel();
3801         let vcpu_affinity = match linux.vcpu_affinity.clone() {
3802             Some(VcpuAffinity::Global(v)) => v,
3803             Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
3804             None => Default::default(),
3805         };
3806 
3807         #[cfg(target_arch = "x86_64")]
3808         let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
3809             Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
3810         } else {
3811             None
3812         };
3813 
3814         #[cfg(target_arch = "x86_64")]
3815         let cpu_config = Some(CpuConfigX86_64::new(
3816             cfg.force_calibrated_tsc_leaf,
3817             cfg.host_cpu_topology,
3818             cfg.enable_hwp,
3819             cfg.no_smt,
3820             cfg.itmt,
3821             vcpu_hybrid_type,
3822         ));
3823         #[cfg(target_arch = "x86_64")]
3824         let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
3825 
3826         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3827         let cpu_config = None;
3828 
3829         #[cfg(target_arch = "riscv64")]
3830         let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
3831 
3832         let handle = vcpu::run_vcpu(
3833             cpu_id,
3834             vcpu_ids[cpu_id],
3835             vcpu,
3836             vcpu_init,
3837             linux.vm.try_clone().context("failed to clone vm")?,
3838             linux
3839                 .irq_chip
3840                 .try_box_clone()
3841                 .context("failed to clone irqchip")?,
3842             linux.vcpu_count,
3843             linux.rt_cpus.contains(&cpu_id),
3844             vcpu_affinity,
3845             linux.delay_rt,
3846             vcpu_thread_barrier.clone(),
3847             (*linux.io_bus).clone(),
3848             (*linux.mmio_bus).clone(),
3849             vm_evt_wrtube
3850                 .try_clone()
3851                 .context("failed to clone vm event tube")?,
3852             from_main_channel,
3853             #[cfg(feature = "gdb")]
3854             to_gdb_channel.clone(),
3855             cfg.core_scheduling,
3856             cfg.per_vm_core_scheduling,
3857             cpu_config,
3858             match vcpu_cgroup_file {
3859                 None => None,
3860                 Some(ref f) => Some(
3861                     f.try_clone()
3862                         .context("failed to clone vcpu cgroup tasks file")?,
3863                 ),
3864             },
3865             #[cfg(target_arch = "x86_64")]
3866             bus_lock_ratelimit_ctrl,
3867             run_mode,
3868             cfg.boost_uclamp,
3869             vcpu_pid_tid_sender.clone(),
3870         )?;
3871         vcpu_handles.push((handle, to_vcpu_channel));
3872     }
3873 
3874     let mut vcpus_pid_tid = BTreeMap::new();
3875     for _ in 0..vcpu_handles.len() {
3876         let vcpu_pid_tid: VcpuPidTid = vcpu_pid_tid_receiver
3877             .recv()
3878             .context("failed receiving vcpu pid/tid")?;
3879         if vcpus_pid_tid
3880             .insert(
3881                 vcpu_pid_tid.vcpu_id,
3882                 (vcpu_pid_tid.process_id, vcpu_pid_tid.thread_id),
3883             )
3884             .is_some()
3885         {
3886             return Err(anyhow!(
3887                 "Vcpu {} returned more than 1 PID and TID",
3888                 vcpu_pid_tid.vcpu_id
3889             ));
3890         }
3891     }
3892 
3893     #[cfg(feature = "gdb")]
3894     // Spawn GDB thread.
3895     if let Some((gdb_port_num, gdb_control_tube, from_vcpu_channel)) = gdb {
3896         let to_vcpu_channels = vcpu_handles
3897             .iter()
3898             .map(|(_handle, channel)| channel.clone())
3899             .collect();
3900         let target = GdbStub::new(gdb_control_tube, to_vcpu_channels, from_vcpu_channel);
3901         std::thread::Builder::new()
3902             .name("gdb".to_owned())
3903             .spawn(move || gdb_thread(target, gdb_port_num))
3904             .context("failed to spawn GDB thread")?;
3905     };
3906 
3907     let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
3908     let sys_allocator_for_thread = sys_allocator_mutex.clone();
3909     let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
3910     let irq_handler_thread = std::thread::Builder::new()
3911         .name("irq_handler_thread".into())
3912         .spawn(move || {
3913             irq_handler_thread(
3914                 irq_control_tubes,
3915                 irq_chip_for_thread,
3916                 sys_allocator_for_thread,
3917                 irq_handler_control_for_thread,
3918             )
3919         })
3920         .unwrap();
3921 
3922     let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
3923     let vm_memory_handler_thread = std::thread::Builder::new()
3924         .name("vm_memory_handler_thread".into())
3925         .spawn({
3926             let vm = linux.vm.try_clone().context("failed to clone Vm")?;
3927             let sys_allocator_mutex = sys_allocator_mutex.clone();
3928             let iommu_client = iommu_host_tube
3929                 .as_ref()
3930                 .map(|t| VmMemoryRequestIommuClient::new(t.clone()));
3931             move || {
3932                 vm_memory_handler_thread(
3933                     vm_memory_control_tubes,
3934                     vm,
3935                     sys_allocator_mutex,
3936                     gralloc,
3937                     iommu_client,
3938                     vm_memory_handler_control_for_thread,
3939                 )
3940             }
3941         })
3942         .unwrap();
3943 
3944     vcpu_thread_barrier.wait();
3945 
3946     // See comment on `VmRequest::execute`.
3947     let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
3948 
3949     // Restore VM (if applicable).
3950     // Must happen after the vCPU barrier to avoid deadlock.
3951     if let Some(path) = &cfg.restore_path {
3952         vm_control::do_restore(
3953             path,
3954             |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
3955             |msg, index| {
3956                 vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
3957             },
3958             &irq_handler_control,
3959             &device_ctrl_tube,
3960             linux.vcpu_count,
3961             |image| {
3962                 linux
3963                     .irq_chip
3964                     .try_box_clone()?
3965                     .restore(image, linux.vcpu_count)
3966             },
3967             /* require_encrypted= */ false,
3968             &mut suspended_pvclock_state,
3969             &linux.vm,
3970         )?;
3971         // Allow the vCPUs to start for real.
3972         vcpu::kick_all_vcpus(
3973             &vcpu_handles,
3974             linux.irq_chip.as_irq_chip(),
3975             VcpuControl::RunState(post_restore_run_mode),
3976         )
3977     }
3978 
3979     #[cfg(feature = "swap")]
3980     if let Some(swap_controller) = &swap_controller {
3981         swap_controller
3982             .on_static_devices_setup_complete()
3983             .context("static device setup complete")?;
3984     }
3985 
3986     let metrics_thread = if metrics::is_initialized() {
3987         Some(
3988             std::thread::Builder::new()
3989                 .name("metrics_thread".into())
3990                 .spawn(move || {
3991                     if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
3992                         error!("Metrics controller error: {:?}", e);
3993                     }
3994                 })
3995                 .context("metrics thread failed")?,
3996         )
3997     } else {
3998         None
3999     };
4000 
4001     let mut exit_state = ExitState::Stop;
4002     let mut pvpanic_code = PvPanicCode::Unknown;
4003     #[cfg(feature = "registered_events")]
4004     let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
4005         HashMap::new();
4006 
4007     'wait: loop {
4008         let events = {
4009             match wait_ctx.wait() {
4010                 Ok(v) => v,
4011                 Err(e) => {
4012                     error!("failed to poll: {}", e);
4013                     break;
4014                 }
4015             }
4016         };
4017 
4018         let mut vm_control_ids_to_remove = Vec::new();
4019         for event in events.iter().filter(|e| e.is_readable) {
4020             match event.token {
4021                 #[cfg(feature = "registered_events")]
4022                 Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
4023                     Ok(reg_evt) => {
4024                         let evt = reg_evt.into_event();
4025                         let mut tubes_to_remove: Vec<String> = Vec::new();
4026                         if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
4027                             for tube in tubes.iter() {
4028                                 if let Err(e) = tube.send(&reg_evt.into_proto()) {
4029                                     warn!(
4030                                         "failed to send registered event {:?} to {}, removing from \
4031                                          registrations: {}",
4032                                         reg_evt, tube.socket_addr, e
4033                                     );
4034                                     tubes_to_remove.push(tube.socket_addr.clone());
4035                                 }
4036                             }
4037                         }
4038                         for tube_addr in tubes_to_remove {
4039                             for tubes in registered_evt_tubes.values_mut() {
4040                                 tubes.retain(|t| t.socket_addr != tube_addr);
4041                             }
4042                         }
4043                         registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
4044                     }
4045                     Err(e) => {
4046                         warn!("failed to recv RegisteredEvent: {}", e);
4047                     }
4048                 },
4049                 Token::VmEvent => {
4050                     let mut break_to_wait: bool = true;
4051                     match vm_evt_rdtube.recv::<VmEventType>() {
4052                         Ok(vm_event) => match vm_event {
4053                             VmEventType::Exit => {
4054                                 info!("vcpu requested shutdown");
4055                                 exit_state = ExitState::Stop;
4056                             }
4057                             VmEventType::Reset => {
4058                                 info!("vcpu requested reset");
4059                                 exit_state = ExitState::Reset;
4060                             }
4061                             VmEventType::Crash => {
4062                                 info!("vcpu crashed");
4063                                 exit_state = ExitState::Crash;
4064                             }
4065                             VmEventType::Panic(panic_code) => {
4066                                 pvpanic_code = PvPanicCode::from_u8(panic_code);
4067                                 info!("Guest reported panic [Code: {}]", pvpanic_code);
4068                                 break_to_wait = false;
4069                             }
4070                             VmEventType::WatchdogReset => {
4071                                 info!("vcpu stall detected");
4072                                 exit_state = ExitState::WatchdogReset;
4073                             }
4074                         },
4075                         Err(e) => {
4076                             warn!("failed to recv VmEvent: {}", e);
4077                         }
4078                     }
4079                     if break_to_wait {
4080                         if pvpanic_code == PvPanicCode::Panicked {
4081                             exit_state = ExitState::GuestPanic;
4082                         }
4083                         break 'wait;
4084                     }
4085                 }
4086                 Token::Suspend => match linux.suspend_tube.1.recv::<bool>() {
4087                     Ok(is_suspend_request) => {
4088                         let mode = if is_suspend_request {
4089                             VmRunMode::Suspending
4090                         } else {
4091                             for dev in &linux.resume_notify_devices {
4092                                 dev.lock().resume_imminent();
4093                             }
4094                             VmRunMode::Running
4095                         };
4096                         info!("VM requested {}", mode);
4097                         vcpu::kick_all_vcpus(
4098                             &vcpu_handles,
4099                             linux.irq_chip.as_irq_chip(),
4100                             VcpuControl::RunState(mode),
4101                         );
4102                     }
4103                     Err(err) => {
4104                         warn!("Failed to read suspend tube {:?}", err);
4105                     }
4106                 },
4107                 Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop if the child
                    // process has exited, except for CLD_STOPPED and CLD_CONTINUED; those two
                    // should be ignored here since they are used by the vmm-swap feature.
4111                     let mut do_exit = false;
4112                     while let Some(siginfo) =
4113                         sigchld_fd.read().context("failed to read signalfd")?
4114                     {
4115                         let pid = siginfo.ssi_pid;
4116                         let pid_label = match linux.pid_debug_label_map.get(&pid) {
4117                             Some(label) => format!("{} (pid {})", label, pid),
4118                             None => format!("pid {}", pid),
4119                         };
4120 
4121                         // TODO(kawasin): this is a temporary exception until device suspension.
4122                         #[cfg(feature = "swap")]
4123                         if siginfo.ssi_code == libc::CLD_STOPPED
4124                             || siginfo.ssi_code == libc::CLD_CONTINUED
4125                         {
4126                             continue;
4127                         }
4128 
4129                         // Ignore clean exits of non-tracked child processes when running without
4130                         // sandboxing. The virtio gpu process launches a render server for
4131                         // pass-through graphics. Host GPU drivers have been observed to fork
4132                         // child processes that exit cleanly which should not be considered a
4133                         // crash. When running with sandboxing, this should be handled by the
4134                         // device's process handler.
4135                         if cfg.jail_config.is_none()
4136                             && !linux.pid_debug_label_map.contains_key(&pid)
4137                             && siginfo.ssi_signo == libc::SIGCHLD as u32
4138                             && siginfo.ssi_code == libc::CLD_EXITED
4139                             && siginfo.ssi_status == 0
4140                         {
4141                             continue;
4142                         }
4143 
4144                         // Allow clean exits of a child process in `worker_process_pids`.
4145                         if siginfo.ssi_signo == libc::SIGCHLD as u32
4146                             && siginfo.ssi_code == libc::CLD_EXITED
4147                             && siginfo.ssi_status == 0
4148                             && worker_process_pids.remove(&(pid as Pid))
4149                         {
4150                             info!("child {pid} exited successfully");
4151                             continue;
4152                         }
4153 
4154                         error!(
4155                             "child {} exited: signo {}, status {}, code {}",
4156                             pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
4157                         );
4158                         do_exit = true;
4159                     }
4160                     if do_exit {
4161                         exit_state = ExitState::Crash;
4162                         break 'wait;
4163                     }
4164                 }
4165                 Token::VmControlServer => {
4166                     if let Some(socket_server) = &control_server_socket {
4167                         match socket_server.accept() {
4168                             Ok(socket) => {
4169                                 let id = next_control_id;
4170                                 next_control_id += 1;
4171                                 wait_ctx
4172                                     .add(&socket, Token::VmControl { id })
4173                                     .context("failed to add descriptor to wait context")?;
4174                                 control_tubes
4175                                     .insert(id, TaggedControlTube::Vm(Tube::try_from(socket)?));
4176                             }
4177                             Err(e) => error!("failed to accept socket: {}", e),
4178                         }
4179                     }
4180                 }
4181                 Token::VmControl { id } => {
4182                     if let Some(socket) = control_tubes.get(&id) {
4183                         let mut state = ControlLoopState {
4184                             linux: &mut linux,
4185                             cfg: &cfg,
4186                             sys_allocator: &sys_allocator_mutex,
4187                             control_tubes: &control_tubes,
4188                             disk_host_tubes: &disk_host_tubes[..],
4189                             #[cfg(feature = "audio")]
4190                             snd_host_tubes: &snd_host_tubes[..],
4191                             #[cfg(feature = "gpu")]
4192                             gpu_control_tube: gpu_control_tube.as_ref(),
4193                             #[cfg(feature = "usb")]
4194                             usb_control_tube: &usb_control_tube,
4195                             #[cfg(target_arch = "x86_64")]
4196                             iommu_host_tube: &iommu_host_tube,
4197                             #[cfg(target_arch = "x86_64")]
4198                             hp_control_tube: &hp_control_tube,
4199                             guest_suspended_cvar: &guest_suspended_cvar,
4200                             #[cfg(feature = "pci-hotplug")]
4201                             hotplug_manager: &mut hotplug_manager,
4202                             #[cfg(feature = "swap")]
4203                             swap_controller: &mut swap_controller,
4204                             vcpu_handles: &vcpu_handles,
4205                             #[cfg(feature = "balloon")]
4206                             balloon_tube: balloon_tube.as_mut(),
4207                             device_ctrl_tube: &device_ctrl_tube,
4208                             irq_handler_control: &irq_handler_control,
4209                             #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
4210                             vm_memory_handler_control: &vm_memory_handler_control,
4211                             #[cfg(feature = "registered_events")]
4212                             registered_evt_tubes: &mut registered_evt_tubes,
4213                             #[cfg(feature = "pvclock")]
4214                             pvclock_host_tube: pvclock_host_tube.clone(),
4215                             vfio_container_manager: &mut vfio_container_manager,
4216                             suspended_pvclock_state: &mut suspended_pvclock_state,
4217                             vcpus_pid_tid: &vcpus_pid_tid,
4218                         };
4219                         let (exit_requested, mut ids_to_remove, add_tubes) =
4220                             process_vm_control_event(&mut state, id, socket)?;
4221                         if exit_requested {
4222                             break 'wait;
4223                         }
4224                         vm_control_ids_to_remove.append(&mut ids_to_remove);
4225                         for socket in add_tubes {
4226                             let id = next_control_id;
4227                             next_control_id += 1;
4228                             wait_ctx
4229                                 .add(socket.as_ref(), Token::VmControl { id })
4230                                 .context(
4231                                     "failed to add hotplug vfio-pci descriptor to wait context",
4232                                 )?;
4233                             control_tubes.insert(id, socket);
4234                         }
4235                     }
4236                 }
4237                 #[cfg(feature = "balloon")]
4238                 Token::BalloonTube => {
4239                     match balloon_tube.as_mut().expect("missing balloon tube").recv() {
4240                         Ok(resp) => {
4241                             for (resp, idx) in resp {
4242                                 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
4243                                     if let Err(e) = tube.send(&resp) {
4244                                         error!("failed to send VmResponse: {}", e);
4245                                     }
4246                                 } else {
4247                                     error!("Bad tube index {}", idx);
4248                                 }
4249                             }
4250                         }
4251                         Err(err) => {
4252                             error!("Error processing balloon tube {:?}", err)
4253                         }
4254                     }
4255                 }
4256             }
4257         }
4258 
4259         remove_hungup_and_drained_tubes(
4260             &events,
4261             &wait_ctx,
4262             &mut control_tubes,
4263             vm_control_ids_to_remove,
4264             |token: &Token| {
4265                 if let Token::VmControl { id } = token {
4266                     return Some(*id);
4267                 }
4268                 None
4269             },
4270         )?;
4271     }
4272 
4273     vcpu::kick_all_vcpus(
4274         &vcpu_handles,
4275         linux.irq_chip.as_irq_chip(),
4276         VcpuControl::RunState(VmRunMode::Exiting),
4277     );
4278     for (handle, _) in vcpu_handles {
4279         if let Err(e) = handle.join() {
4280             error!("failed to join vcpu thread: {:?}", e);
4281         }
4282     }
4283 
4284     // After joining all vcpu threads, unregister the process-wide signal handler.
4285     if let Err(e) = vcpu::remove_vcpu_signal_handler() {
4286         error!("failed to remove vcpu thread signal handler: {:#}", e);
4287     }
4288 
4289     // Stop the vmm-swap monitor process.
4290     #[cfg(feature = "swap")]
4291     drop(swap_controller);
4292 
4293     // Stop pci root worker thread
4294     #[cfg(target_arch = "x86_64")]
4295     {
4296         let _ = hp_control_tube.send(PciRootCommand::Kill);
4297         if let Err(e) = hp_thread.join() {
4298             error!("failed to join hotplug thread: {:?}", e);
4299         }
4300     }
4301 
4302     if linux.devices_thread.is_some() {
4303         if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
4304             error!("failed to stop device control loop: {}", e);
4305         };
4306         if let Some(thread) = linux.devices_thread.take() {
4307             if let Err(e) = thread.join() {
4308                 error!("failed to exit devices thread: {:?}", e);
4309             }
4310         }
4311     }
4312 
4313     // Shut down the VM Memory handler thread.
4314     if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
4315         error!(
4316             "failed to request exit from VM Memory handler thread: {}",
4317             e
4318         );
4319     }
4320     if let Err(e) = vm_memory_handler_thread.join() {
4321         error!("failed to exit VM Memory handler thread: {:?}", e);
4322     }
4323 
4324     // Shut down the IRQ handler thread.
4325     if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
4326         error!("failed to request exit from IRQ handler thread: {}", e);
4327     }
4328     if let Err(e) = irq_handler_thread.join() {
4329         error!("failed to exit irq handler thread: {:?}", e);
4330     }
4331 
4332     // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
4333     // inside `linux`. If the checks below fail, then some other thread is probably still running
4334     // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
4335     // cleaned up.
4336     match Arc::try_unwrap(std::mem::replace(
4337         &mut linux.mmio_bus,
4338         Arc::new(Bus::new(BusType::Mmio)),
4339     )) {
4340         Ok(_) => {}
4341         Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
4342     }
4343     match Arc::try_unwrap(std::mem::replace(
4344         &mut linux.io_bus,
4345         Arc::new(Bus::new(BusType::Io)),
4346     )) {
4347         Ok(_) => {}
4348         Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
4349     }
4350 
4351     // Explicitly drop the VM structure here to allow the devices to clean up before the
4352     // control sockets are closed when this function exits.
4353     mem::drop(linux);
4354 
4355     // Drop the hotplug manager to tell the warden process to exit before we try to join
4356     // the metrics thread.
4357     #[cfg(feature = "pci-hotplug")]
4358     mem::drop(hotplug_manager);
4359 
4360     // All our children should have exited by now, so closing our fd should
4361     // terminate metrics. Then join so that everything gets flushed.
4362     metrics::get_destructor().cleanup();
4363     if let Some(metrics_thread) = metrics_thread {
4364         if let Err(e) = metrics_thread.join() {
4365             error!("failed to exit irq handler thread: {:?}", e);
4366         }
4367     }
4368 
4369     stdin()
4370         .set_canon_mode()
4371         .expect("failed to restore canonical mode for terminal");
4372 
4373     Ok(exit_state)
4374 }
4375 
/// Wait-context tokens for the IRQ handler thread's event loop.
#[derive(EventToken)]
enum IrqHandlerToken {
    // An IRQ event registered with the irqchip fired; `index` identifies which one.
    IrqFd { index: IrqEventIndex },
    // A device IRQ control tube (keyed by `id`) has a pending request.
    VmIrq { id: usize },
    // The irqchip's delayed-IRQ trigger fired (only registered if the chip has one).
    DelayedIrqFd,
    // The main thread sent a request on the handler control tube.
    HandlerControl,
}
4383 
/// Handles IRQs and requests from devices to add additional IRQ lines.
///
/// Services IRQ events from the irqchip and `VmIrqRequest`s from device
/// control tubes until the main thread requests an exit via `handler_control`
/// (or the control tube hangs up unexpectedly).
fn irq_handler_thread(
    irq_control_tubes: Vec<Tube>,
    mut irq_chip: Box<dyn IrqChipArch + 'static>,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    // Always watch the control tube so the thread can be told to exit, refresh
    // its IRQ event tokens, or adopt new control tubes.
    let wait_ctx = WaitContext::build_with(&[(
        handler_control.get_read_notifier(),
        IrqHandlerToken::HandlerControl,
    )])
    .context("failed to build wait context")?;

    // Not every irqchip has a delayed-event trigger; only watch it if present.
    if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
        wait_ctx
            .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
            .context("failed to add descriptor to wait context")?;
    }

    let mut irq_event_tokens = irq_chip
        .irq_event_tokens()
        .context("failed get event tokens from irqchip")?;

    for (index, _gsi, evt) in irq_event_tokens.iter() {
        wait_ctx
            .add(evt, IrqHandlerToken::IrqFd { index: *index })
            .context("failed to add irq chip event tokens to wait context")?;
    }

    // Control tubes are keyed by a monotonically increasing id so that wait
    // tokens remain stable as tubes are added and removed over time.
    let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
    let mut next_control_id = irq_control_tubes.len();
    for (id, socket) in irq_control_tubes.iter() {
        wait_ctx
            .add(
                socket.get_read_notifier(),
                IrqHandlerToken::VmIrq { id: *id },
            )
            .context("irq control tubes to wait context")?;
    }

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break 'wait;
                }
            }
        };
        let token_count = events.len();
        let mut vm_irq_tubes_to_remove = Vec::new();
        let mut notify_control_on_iteration_end = false;

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                IrqHandlerToken::HandlerControl => {
                    match handler_control.recv::<IrqHandlerRequest>() {
                        Ok(request) => {
                            match request {
                                IrqHandlerRequest::Exit => break 'wait,
                                IrqHandlerRequest::AddIrqControlTubes(tubes) => {
                                    for socket in tubes {
                                        let id = next_control_id;
                                        next_control_id += 1;
                                        wait_ctx
                                        .add(
                                            socket.get_read_notifier(),
                                            IrqHandlerToken::VmIrq { id },
                                        )
                                        .context("failed to add new IRQ control Tube to wait context")?;
                                        irq_control_tubes.insert(id, socket);
                                    }
                                }
                                IrqHandlerRequest::RefreshIrqEventTokens => {
                                    // Unregister the stale tokens before re-querying
                                    // the irqchip, then register the fresh set.
                                    for (_index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx.delete(evt).context(
                                            "failed to remove irq chip event \
                                                token from wait context",
                                        )?;
                                    }

                                    irq_event_tokens = irq_chip
                                        .irq_event_tokens()
                                        .context("failed get event tokens from irqchip")?;
                                    for (index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx
                                            .add(evt, IrqHandlerToken::IrqFd { index: *index })
                                            .context(
                                                "failed to add irq chip event \
                                                tokens to wait context",
                                            )?;
                                    }

                                    // The requester waits for this acknowledgement, so a
                                    // send failure is only logged rather than fatal.
                                    if let Err(e) = handler_control
                                        .send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
                                    {
                                        error!(
                                            "failed to notify IRQ event token refresh \
                                            was completed: {}",
                                            e
                                        );
                                    }
                                }
                                IrqHandlerRequest::WakeAndNotifyIteration => {
                                    notify_control_on_iteration_end = true;
                                }
                            }
                        }
                        Err(e) => {
                            if let TubeError::Disconnected = e {
                                panic!("irq handler control tube disconnected.");
                            } else {
                                error!("failed to recv IrqHandlerRequest: {}", e);
                            }
                        }
                    }
                }
                IrqHandlerToken::VmIrq { id } => {
                    if let Some(tube) = irq_control_tubes.get(&id) {
                        handle_irq_tube_request(
                            &sys_allocator_mutex,
                            &mut irq_chip,
                            &mut vm_irq_tubes_to_remove,
                            &wait_ctx,
                            tube,
                            id,
                        );
                    }
                }
                IrqHandlerToken::IrqFd { index } => {
                    if let Err(e) = irq_chip.service_irq_event(index) {
                        error!("failed to signal irq {}: {}", index, e);
                    }
                }
                IrqHandlerToken::DelayedIrqFd => {
                    if let Err(e) = irq_chip.process_delayed_irq_events() {
                        warn!("can't deliver delayed irqs: {}", e);
                    }
                }
            }
        }

        if notify_control_on_iteration_end {
            // Report the number of tokens seen this iteration minus one —
            // presumably excluding the control event that carried the wake
            // request itself; confirm against the snapshot/restore caller.
            if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
                token_count - 1,
            )) {
                error!(
                    "failed to notify on iteration completion (snapshotting may fail): {}",
                    e
                );
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut irq_control_tubes,
            vm_irq_tubes_to_remove,
            |token: &IrqHandlerToken| {
                if let IrqHandlerToken::VmIrq { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A control-tube hangup without a prior Exit request is unexpected;
        // stop the loop instead of spinning on a dead tube.
        if events.iter().any(|e| {
            e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
        }) {
            error!("IRQ handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4559 
handle_irq_tube_request( sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>, irq_chip: &mut Box<dyn IrqChipArch + 'static>, vm_irq_tubes_to_remove: &mut Vec<usize>, wait_ctx: &WaitContext<IrqHandlerToken>, tube: &Tube, tube_index: usize, )4560 fn handle_irq_tube_request(
4561     sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
4562     irq_chip: &mut Box<dyn IrqChipArch + 'static>,
4563     vm_irq_tubes_to_remove: &mut Vec<usize>,
4564     wait_ctx: &WaitContext<IrqHandlerToken>,
4565     tube: &Tube,
4566     tube_index: usize,
4567 ) {
4568     match tube.recv::<VmIrqRequest>() {
4569         Ok(request) => {
4570             let response = {
4571                 request.execute(
4572                     |setup| match setup {
4573                         IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
4574                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4575                             let source = IrqEventSource {
4576                                 device_id: device_id.try_into().expect("Invalid device_id"),
4577                                 queue_id,
4578                                 device_name,
4579                             };
4580                             if let Some(event_index) =
4581                                 irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
4582                             {
4583                                 if let Err(e) =
4584                                     wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
4585                                 {
4586                                     warn!("failed to add IrqFd to poll context: {}", e);
4587                                     return Err(e);
4588                                 }
4589                             }
4590                             Ok(())
4591                         }
4592                         IrqSetup::Route(route) => irq_chip.route_irq(route),
4593                         IrqSetup::UnRegister(irq, ev) => {
4594                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4595                             irq_chip.unregister_edge_irq_event(irq, &irq_evt)
4596                         }
4597                     },
4598                     &mut sys_allocator_mutex.lock(),
4599                 )
4600             };
4601             if let Err(e) = tube.send(&response) {
4602                 error!("failed to send VmIrqResponse: {}", e);
4603             }
4604         }
4605         Err(e) => {
4606             if let TubeError::Disconnected = e {
4607                 vm_irq_tubes_to_remove.push(tube_index);
4608             } else {
4609                 error!("failed to recv VmIrqRequest: {}", e);
4610             }
4611         }
4612     }
4613 }
4614 
/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// Begin servicing the given memory control tubes in addition to the
    /// current set. No response is sent for this command.
    AddControlTubes(Vec<VmMemoryTube>),
    /// Shut down the handler thread. No response is sent for this command.
    Exit,
}
4623 
/// Services `VmMemoryRequest`s from device control tubes until told to exit
/// via `handler_control` (or the control tube hangs up unexpectedly).
fn vm_memory_handler_thread(
    control_tubes: Vec<VmMemoryTube>,
    mut vm: impl Vm,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    mut gralloc: RutabagaGralloc,
    mut iommu_client: Option<VmMemoryRequestIommuClient>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    // Wait-context tokens local to this thread's event loop.
    #[derive(EventToken)]
    enum Token {
        VmControl { id: usize },
        HandlerControl,
    }

    let wait_ctx =
        WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
            .context("failed to build wait context")?;
    // Control tubes are keyed by a monotonically increasing id so that wait
    // tokens remain stable as tubes are added and removed over time.
    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    let mut next_control_id = control_tubes.len();
    for (id, socket) in control_tubes.iter() {
        wait_ctx
            .add(socket.as_ref(), Token::VmControl { id: *id })
            .context("failed to add descriptor to wait context")?;
    }

    // Shared bookkeeping for registered memory regions, threaded through each
    // request execution.
    let mut region_state: VmMemoryRegionState = Default::default();

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break;
                }
            }
        };

        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
                    Ok(request) => match request {
                        VmMemoryHandlerRequest::Exit => break 'wait,
                        VmMemoryHandlerRequest::AddControlTubes(tubes) => {
                            for socket in tubes {
                                let id = next_control_id;
                                next_control_id += 1;
                                wait_ctx
                                    .add(socket.get_read_notifier(), Token::VmControl { id })
                                    .context(
                                        "failed to add new vm memory control Tube to wait context",
                                    )?;
                                control_tubes.insert(id, socket);
                            }
                        }
                    },
                    Err(e) => {
                        if let TubeError::Disconnected = e {
                            panic!("vm memory control tube disconnected.");
                        } else {
                            error!("failed to recv VmMemoryHandlerRequest: {}", e);
                        }
                    }
                },
                Token::VmControl { id } => {
                    if let Some(VmMemoryTube {
                        tube,
                        expose_with_viommu,
                    }) = control_tubes.get(&id)
                    {
                        match tube.recv::<VmMemoryRequest>() {
                            Ok(request) => {
                                // Only tubes flagged for viommu exposure get the
                                // iommu client passed through.
                                let response = request.execute(
                                    tube,
                                    &mut vm,
                                    &mut sys_allocator_mutex.lock(),
                                    &mut gralloc,
                                    if *expose_with_viommu {
                                        iommu_client.as_mut()
                                    } else {
                                        None
                                    },
                                    &mut region_state,
                                );
                                if let Err(e) = tube.send(&response) {
                                    error!("failed to send VmMemoryControlResponse: {}", e);
                                }
                            }
                            Err(e) => {
                                if let TubeError::Disconnected = e {
                                    vm_control_ids_to_remove.push(id);
                                } else {
                                    error!("failed to recv VmMemoryControlRequest: {}", e);
                                }
                            }
                        }
                    }
                }
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut control_tubes,
            vm_control_ids_to_remove,
            |token: &Token| {
                if let Token::VmControl { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A control-tube hangup without a prior Exit request is unexpected;
        // stop the loop instead of spinning on a dead tube.
        if events
            .iter()
            .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
        {
            error!("vm memory handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4748 
4749 /// When control tubes hang up, we want to make sure that we've fully drained
4750 /// the underlying socket before removing it. This function also handles
4751 /// removing closed sockets in such a way that avoids phantom events.
4752 ///
4753 /// `tube_ids_to_remove` is the set of ids that we already know should
4754 /// be removed (e.g. from getting a disconnect error on read).
remove_hungup_and_drained_tubes<T, U>( events: &SmallVec<[TriggeredEvent<T>; 16]>, wait_ctx: &WaitContext<T>, tubes: &mut BTreeMap<usize, U>, mut tube_ids_to_remove: Vec<usize>, get_tube_id: fn(token: &T) -> Option<usize>, ) -> anyhow::Result<()> where T: EventToken, U: ReadNotifier,4755 fn remove_hungup_and_drained_tubes<T, U>(
4756     events: &SmallVec<[TriggeredEvent<T>; 16]>,
4757     wait_ctx: &WaitContext<T>,
4758     tubes: &mut BTreeMap<usize, U>,
4759     mut tube_ids_to_remove: Vec<usize>,
4760     get_tube_id: fn(token: &T) -> Option<usize>,
4761 ) -> anyhow::Result<()>
4762 where
4763     T: EventToken,
4764     U: ReadNotifier,
4765 {
4766     // It's possible more data is readable and buffered while the socket is hungup,
4767     // so don't delete the tube from the poll context until we're sure all the
4768     // data is read.
4769     // Below case covers a condition where we have received a hungup event and the tube is not
4770     // readable.
4771     // In case of readable tube, once all data is read, any attempt to read more data on hungup
4772     // tube should fail. On such failure, we get Disconnected error and ids gets added to
4773     // tube_ids_to_remove by the time we reach here.
4774     for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
4775         if let Some(id) = get_tube_id(&event.token) {
4776             tube_ids_to_remove.push(id);
4777         }
4778     }
4779 
4780     tube_ids_to_remove.dedup();
4781     for id in tube_ids_to_remove {
4782         // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
4783         // this automatically when the FD inserted into the `wait_ctx` is closed after this
4784         // if-block, but this removal can be deferred unpredictably. In some instances where the
4785         // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
4786         // that has already been closed. Because the token associated with that spurious event
4787         // now belongs to a different socket, the control loop will start to interact with
4788         // sockets that might not be ready to use. This can cause incorrect hangup detection or
4789         // blocking on a socket that will never be ready. See also: crbug.com/1019986
4790         if let Some(socket) = tubes.remove(&id) {
4791             wait_ctx
4792                 .delete(socket.get_read_notifier())
4793                 .context("failed to remove descriptor from wait context")?;
4794         }
4795     }
4796     Ok(())
4797 }
4798 
/// Start and jail a vhost-user device according to its configuration and a vhost listener string.
///
/// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
/// call outside of `start_devices`!
///
/// Returns the pid of the jailed device process, along with any parent-side resources that the
/// caller must keep alive (and eventually clean up) while the child runs.
fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
    jail_config: Option<&JailConfig>,
    params: T,
    vhost: &str,
    name: &str,
) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
    // Descriptors that must remain open across the fork for the child to function.
    let mut keep_rds = Vec::new();

    base::syslog::push_descriptors(&mut keep_rds);
    cros_tracing::push_descriptors!(&mut keep_rds);
    metrics::push_descriptors(&mut keep_rds);

    let jail_type = VirtioDeviceType::VhostUser;

    // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
    // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
    let jail = params
        .create_jail(jail_config, jail_type)
        .with_context(|| format!("failed to create jail for {}", name))?
        .ok_or(())
        .or_else(|_| create_default_minijail())
        .with_context(|| format!("failed to create empty jail for {}", name))?;

    // Create the device in the parent process, so the child does not need any privileges necessary
    // to do it (only runtime capabilities are required).
    let device = params
        .create_vhost_user_device(&mut keep_rds)
        .context("failed to create vhost-user device")?;
    let mut listener =
        VhostUserListener::new(vhost).context("failed to create the vhost listener")?;
    keep_rds.push(listener.as_raw_descriptor());
    let parent_resources = listener.take_parent_process_resources();

    // Executor must be created before jail in order to prevent the jailed process from creating
    // unrestricted io_urings.
    let ex = Executor::new().context("Failed to create an Executor")?;
    keep_rds.extend(ex.as_raw_descriptors());

    // Deduplicate the FDs since minijail expects them to be unique.
    keep_rds.sort_unstable();
    keep_rds.dedup();

    // SAFETY:
    // Safe because we are keeping all the descriptors needed for the child to function.
    match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
        0 => {
            // In the child process.

            // Free memory for the resources managed by the parent, without running drop() on them.
            // The parent will do it as we exit.
            let _ = std::mem::ManuallyDrop::new(parent_resources);

            // Make sure the child process does not survive its parent.
            // SAFETY: trivially safe
            if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
                panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
            }

            // Set the name for the thread.
            const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
            let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
            let thread_name = CString::new(debug_label_trimmed).unwrap();
            // SAFETY:
            // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
            // an error if we don't anyway).
            let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };

            // Run the device loop and terminate the child process once it exits.
            let res = match listener.run_device(ex, device) {
                Ok(()) => 0,
                Err(e) => {
                    error!("error while running device {}: {:#}", name, e);
                    1
                }
            };
            // SAFETY: trivially safe
            unsafe { libc::exit(res) };
        }
        pid => {
            // In the parent process. We will drop the device and listener when exiting this method.
            // This is fine as ownership for both has been transferred to the child process and they
            // will keep living there. We just retain `parent_resources` for things we are supposed
            // to clean up ourselves.

            info!("process for device {} (PID {}) started", &name, pid);
            #[cfg(feature = "seccomp_trace")]
            debug!(
                    "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
                    pid,
                    &name,
                    read_jail_addr(&jail)
                );
            Ok((pid, parent_resources))
        }
    }
}
4901 
process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()>4902 fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
4903     let command = tube
4904         .recv::<VmRequest>()
4905         .context("failed to receive VmRequest")?;
4906     let resp = match command {
4907         VmRequest::DiskCommand {
4908             disk_index,
4909             ref command,
4910         } => match &disk_host_tubes.get(disk_index) {
4911             Some(tube) => handle_disk_command(command, tube),
4912             None => VmResponse::Err(base::Error::new(libc::ENODEV)),
4913         },
4914         request => {
4915             error!(
4916                 "Request {:?} currently not supported in vhost user backend",
4917                 request
4918             );
4919             VmResponse::Err(base::Error::new(libc::EPERM))
4920         }
4921     };
4922 
4923     tube.send(&resp).context("failed to send VmResponse")?;
4924     Ok(())
4925 }
4926 
start_vhost_user_control_server( control_server_socket: UnlinkUnixSeqpacketListener, disk_host_tubes: Vec<Tube>, )4927 fn start_vhost_user_control_server(
4928     control_server_socket: UnlinkUnixSeqpacketListener,
4929     disk_host_tubes: Vec<Tube>,
4930 ) {
4931     info!("Start vhost-user control server");
4932     loop {
4933         match control_server_socket.accept() {
4934             Ok(socket) => {
4935                 let tube = match Tube::try_from(socket) {
4936                     Ok(tube) => tube,
4937                     Err(e) => {
4938                         error!("failed to open tube: {:#}", e);
4939                         return;
4940                     }
4941                 };
4942                 if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
4943                     error!("failed to process control request: {:#}", e);
4944                 }
4945             }
4946             Err(e) => {
4947                 error!("failed to establish connection: {}", e);
4948             }
4949         }
4950     }
4951 }
4952 
start_devices(opts: DevicesCommand) -> anyhow::Result<()>4953 pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
4954     if let Some(async_executor) = opts.async_executor {
4955         Executor::set_default_executor_kind(async_executor)
4956             .context("Failed to set the default async executor")?;
4957     }
4958 
4959     struct DeviceJailInfo {
4960         // Unique name for the device, in the form `foomatic-0`.
4961         name: String,
4962         _drop_resources: Option<Box<dyn std::any::Any>>,
4963     }
4964 
4965     fn add_device<T: VirtioDeviceBuilder>(
4966         i: usize,
4967         device_params: T,
4968         vhost: &str,
4969         jail_config: Option<&JailConfig>,
4970         devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
4971     ) -> anyhow::Result<()> {
4972         let name = format!("{}-{}", T::NAME, i);
4973 
4974         let (pid, _drop_resources) =
4975             jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;
4976 
4977         devices_jails.insert(
4978             pid,
4979             DeviceJailInfo {
4980                 name,
4981                 _drop_resources,
4982             },
4983         );
4984 
4985         Ok(())
4986     }
4987 
4988     let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();
4989 
4990     let jail = if opts.disable_sandbox {
4991         None
4992     } else {
4993         Some(&opts.jail)
4994     };
4995 
4996     // Create control server socket
4997     let control_server_socket = opts.control_socket.map(|path| {
4998         UnlinkUnixSeqpacketListener(
4999             UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
5000         )
5001     });
5002 
5003     // Create serial devices.
5004     for (i, params) in opts.serial.iter().enumerate() {
5005         let serial_config = &params.device;
5006         add_device(i, serial_config, &params.vhost, jail, &mut devices_jails)?;
5007     }
5008 
5009     let mut disk_host_tubes = Vec::new();
5010     let control_socket_exists = control_server_socket.is_some();
5011     // Create block devices.
5012     for (i, params) in opts.block.iter().enumerate() {
5013         let tube = if control_socket_exists {
5014             let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
5015             disk_host_tubes.push(host_tube);
5016             Some(device_tube)
5017         } else {
5018             None
5019         };
5020         let disk_config = DiskConfig::new(&params.device, tube);
5021         add_device(i, disk_config, &params.vhost, jail, &mut devices_jails)?;
5022     }
5023 
5024     // Create vsock devices.
5025     for (i, params) in opts.vsock.iter().enumerate() {
5026         add_device(i, &params.device, &params.vhost, jail, &mut devices_jails)?;
5027     }
5028 
5029     // Create network devices.
5030     #[cfg(feature = "net")]
5031     for (i, params) in opts.net.iter().enumerate() {
5032         add_device(i, &params.device, &params.vhost, jail, &mut devices_jails)?;
5033     }
5034 
5035     // No device created, that's probably not intended - print the help in that case.
5036     if devices_jails.is_empty() {
5037         let err = DevicesCommand::from_args(
5038             &[&std::env::args().next().unwrap_or(String::from("crosvm"))],
5039             &["--help"],
5040         )
5041         .unwrap_err();
5042         println!("{}", err.output);
5043         return Ok(());
5044     }
5045 
5046     let ex = Executor::new()?;
5047     if let Some(control_server_socket) = control_server_socket {
5048         // Start the control server in the parent process.
5049         ex.spawn_blocking(move || {
5050             start_vhost_user_control_server(control_server_socket, disk_host_tubes)
5051         })
5052         .detach();
5053     }
5054 
5055     // Now wait for all device processes to return.
5056     while !devices_jails.is_empty() {
5057         match base::linux::wait_for_pid(-1, 0) {
5058             Err(e) => panic!("error waiting for child process to complete: {:#}", e),
5059             Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
5060                 Some((_, info)) => {
5061                     if let Some(status) = wait_status.code() {
5062                         info!(
5063                             "process for device {} (PID {}) exited with code {}",
5064                             &info.name, pid, status
5065                         );
5066                     } else if let Some(signal) = wait_status.signal() {
5067                         warn!(
5068                             "process for device {} (PID {}) has been killed by signal {:?}",
5069                             &info.name, pid, signal,
5070                         );
5071                     }
5072                 }
5073                 None => error!("pid {} is not one of our device processes", pid),
5074             },
            // `wait_for_pid` will necessarily return a PID because we asked it to wait for one to
            // complete.
5077             Ok((None, _)) => unreachable!(),
5078         }
5079     }
5080 
5081     info!("all device processes have exited");
5082 
5083     Ok(())
5084 }
5085 
/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
/// making crash reports incomprehensible.
///
/// Returns whatever identifier `crash_report::setup_crash_reporting` yields on success.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    // All emulator processes report under the shared "emulator" product type; the remaining
    // attributes are intentionally left unset here.
    let attributes = crash_report::CrashReportAttributes {
        product_type: "emulator".to_owned(),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attributes)
}
5098 
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use vm_memory::MemoryRegionPurpose;

    use super::*;

    // Create a file-backed mapping parameters struct with the given `address` and `size` and other
    // parameters set to default values.
    fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
        FileBackedMappingParameters {
            address,
            size,
            path: PathBuf::new(),
            offset: 0,
            writable: false,
            sync: false,
            align: false,
            // `ram: true` means the mapping must be a subset of an existing RAM region, which is
            // the property these tests exercise.
            ram: true,
        }
    }

    // Exercises `punch_holes_in_guest_mem_layout_for_mappings` with file-backed mappings placed
    // at the start, end, and middle of both the low and high guest memory regions, plus the two
    // error cases (mapping outside any RAM region, mapping straddling two regions) and a region
    // with a non-default purpose.
    #[test]
    fn guest_mem_file_backed_mappings_overlap() {
        // Base case: no file mappings; output layout should be identical.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping that does not overlap guest memory.
        // A RAM-backed mapping outside every RAM region is rejected with an error.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000, 0x1000)]
            )
            .unwrap_err()
            .to_string(),
            "RAM file-backed-mapping must be a subset of a RAM region",
        );

        // File mapping at the start of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0, 0x2000)]
            )
            .unwrap(),
            vec![
                (
                    GuestAddress(0),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0, 0x2000)),
                ),
                (
                    GuestAddress(0x2000),
                    0xD000_0000 - 0x2000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping at the end of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
                (
                    GuestAddress(0xD000_0000 - 0x2000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)),
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping fully contained within the middle of the low address space region.
        // The original region is split into three: before, mapping, after.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0x1000, Default::default()),
                (
                    GuestAddress(0x1000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1000, 0x2000)),
                ),
                (
                    GuestAddress(0x3000),
                    0xD000_0000 - 0x3000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping at the start of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1_0000_0000, 0x2000)),
                ),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ],
        );

        // File mapping at the end of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
                (
                    GuestAddress(0x1_0008_0000 - 0x2000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)),
                ),
            ],
        );

        // File mapping fully contained within the middle of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
                (
                    GuestAddress(0x1_0000_1000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1_0000_1000, 0x2000)),
                ),
                (
                    GuestAddress(0x1_0000_3000),
                    0x8_0000 - 0x3000,
                    Default::default()
                ),
            ],
        );

        // File mapping overlapping two guest memory regions.
        // 0xA000_0000 + 0x60002000 extends past the end of the low region into the gap, so the
        // mapping is not a subset of any single RAM region and must be rejected.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
            )
            .unwrap_err()
            .to_string(),
            "RAM file-backed-mapping must be a subset of a RAM region",
        );

        // File mapping with different region purpose.
        // The punched region keeps the original region's purpose (Bios) rather than being
        // rewritten to GuestMemoryRegion.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0x0000), 0x2000, Default::default()),
                    (
                        GuestAddress(0x2000),
                        0x2000,
                        MemoryRegionOptions::new().purpose(MemoryRegionPurpose::Bios)
                    ),
                ],
                &[test_file_backed_mapping(0x2000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0x0000), 0x2000, Default::default()),
                (
                    GuestAddress(0x2000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::Bios)
                        .file_backed(test_file_backed_mapping(0x2000, 0x2000)),
                ),
            ],
        );
    }

    // Sanity-check for `normalize_cpu_ipc_ratios`. The expected values are consistent with each
    // IPC ratio being scaled by the CPU's max frequency relative to the host max frequency and
    // truncated (1024 * 500000 / 5000000 = 102; 512 * 200000 / 5000000 = 20) — NOTE(review):
    // confirm against the `normalize_cpu_ipc_ratios` implementation.
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    #[test]
    fn normalized_cpu_ipc_ratios_simple() {
        let host_max_freq = 5000000;
        let mut cpu_frequencies = BTreeMap::new();
        cpu_frequencies.insert(0, vec![100000, 200000, 500000]);
        cpu_frequencies.insert(1, vec![50000, 75000, 200000]);

        let mut cpu_ipc_ratio = BTreeMap::new();
        cpu_ipc_ratio.insert(0, 1024);
        cpu_ipc_ratio.insert(1, 512);

        let normalized_cpu_ipc_ratios = normalize_cpu_ipc_ratios(
            // Feed each CPU's maximum frequency from its frequency table.
            cpu_frequencies.iter().map(|(cpu_id, frequencies)| {
                (
                    *cpu_id,
                    frequencies.iter().copied().max().unwrap_or_default(),
                )
            }),
            host_max_freq,
            // Unknown CPUs default to the nominal ratio of 1024.
            |cpu_id| cpu_ipc_ratio.get(&cpu_id).copied().unwrap_or(1024),
        )
        .expect("normalize_cpu_ipc_ratios failed");

        let ratios: Vec<(usize, u32)> = normalized_cpu_ipc_ratios.into_iter().collect();
        assert_eq!(ratios, vec![(0, 102), (1, 20)]);
    }
}
5383