• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #[cfg(target_os = "android")]
6 mod android;
7 pub mod cmdline;
8 pub mod config;
9 mod device_helpers;
10 #[cfg(feature = "gpu")]
11 pub(crate) mod gpu;
12 #[cfg(feature = "pci-hotplug")]
13 pub(crate) mod jail_warden;
14 #[cfg(feature = "pci-hotplug")]
15 pub(crate) mod pci_hotplug_helpers;
16 #[cfg(feature = "pci-hotplug")]
17 pub(crate) mod pci_hotplug_manager;
18 mod vcpu;
19 
20 use std::cmp::max;
21 use std::collections::BTreeMap;
22 use std::collections::BTreeSet;
23 #[cfg(feature = "registered_events")]
24 use std::collections::HashMap;
25 #[cfg(feature = "registered_events")]
26 use std::collections::HashSet;
27 use std::convert::TryInto;
28 use std::ffi::CString;
29 use std::fs::File;
30 use std::fs::OpenOptions;
31 #[cfg(feature = "registered_events")]
32 use std::hash::Hash;
33 use std::io::prelude::*;
34 use std::io::stdin;
35 use std::iter;
36 use std::mem;
37 #[cfg(target_arch = "x86_64")]
38 use std::ops::RangeInclusive;
39 use std::os::unix::prelude::OpenOptionsExt;
40 use std::os::unix::process::ExitStatusExt;
41 use std::path::Path;
42 use std::process;
43 #[cfg(feature = "registered_events")]
44 use std::rc::Rc;
45 use std::sync::mpsc;
46 use std::sync::Arc;
47 use std::sync::Barrier;
48 use std::thread::JoinHandle;
49 
50 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
51 use aarch64::AArch64 as Arch;
52 use acpi_tables::sdt::SDT;
53 use anyhow::anyhow;
54 use anyhow::bail;
55 use anyhow::Context;
56 use anyhow::Result;
57 use arch::DtbOverlay;
58 use arch::IrqChipArch;
59 use arch::LinuxArch;
60 use arch::RunnableLinuxVm;
61 use arch::VcpuAffinity;
62 use arch::VcpuArch;
63 use arch::VirtioDeviceStub;
64 use arch::VmArch;
65 use arch::VmComponents;
66 use arch::VmImage;
67 use argh::FromArgs;
68 use base::ReadNotifier;
69 #[cfg(feature = "balloon")]
70 use base::UnixSeqpacket;
71 use base::UnixSeqpacketListener;
72 use base::UnlinkUnixSeqpacketListener;
73 use base::*;
74 use cros_async::Executor;
75 use device_helpers::*;
76 use devices::create_devices_worker_thread;
77 use devices::serial_device::SerialHardware;
78 #[cfg(feature = "pvclock")]
79 use devices::tsc::get_tsc_sync_mitigations;
80 use devices::vfio::VfioCommonSetup;
81 use devices::vfio::VfioCommonTrait;
82 #[cfg(feature = "gpu")]
83 use devices::virtio;
84 #[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
85 use devices::virtio::device_constants::video::VideoDeviceType;
86 #[cfg(feature = "gpu")]
87 use devices::virtio::gpu::EventDevice;
88 #[cfg(target_arch = "x86_64")]
89 use devices::virtio::memory_mapper::MemoryMapper;
90 use devices::virtio::memory_mapper::MemoryMapperTrait;
91 use devices::virtio::vhost::user::VhostUserListener;
92 use devices::virtio::vhost::user::VhostUserListenerTrait;
93 #[cfg(feature = "balloon")]
94 use devices::virtio::BalloonFeatures;
95 #[cfg(feature = "balloon")]
96 use devices::virtio::BalloonMode;
97 #[cfg(feature = "pci-hotplug")]
98 use devices::virtio::NetParameters;
99 #[cfg(feature = "pci-hotplug")]
100 use devices::virtio::NetParametersMode;
101 use devices::virtio::VirtioDevice;
102 use devices::virtio::VirtioDeviceType;
103 use devices::virtio::VirtioTransportType;
104 use devices::Bus;
105 use devices::BusDeviceObj;
106 use devices::BusType;
107 use devices::CoIommuDev;
108 #[cfg(feature = "usb")]
109 use devices::DeviceProvider;
110 #[cfg(target_arch = "x86_64")]
111 use devices::HotPlugBus;
112 #[cfg(target_arch = "x86_64")]
113 use devices::HotPlugKey;
114 use devices::IommuDevType;
115 use devices::IrqEventIndex;
116 use devices::IrqEventSource;
117 #[cfg(feature = "pci-hotplug")]
118 use devices::NetResourceCarrier;
119 #[cfg(target_arch = "x86_64")]
120 use devices::PciAddress;
121 #[cfg(target_arch = "x86_64")]
122 use devices::PciBridge;
123 use devices::PciDevice;
124 #[cfg(target_arch = "x86_64")]
125 use devices::PciMmioMapper;
126 #[cfg(target_arch = "x86_64")]
127 use devices::PciRoot;
128 #[cfg(target_arch = "x86_64")]
129 use devices::PciRootCommand;
130 #[cfg(target_arch = "x86_64")]
131 use devices::PcieDownstreamPort;
132 #[cfg(target_arch = "x86_64")]
133 use devices::PcieHostPort;
134 #[cfg(target_arch = "x86_64")]
135 use devices::PcieRootPort;
136 #[cfg(target_arch = "x86_64")]
137 use devices::PcieUpstreamPort;
138 use devices::PvPanicCode;
139 use devices::PvPanicPciDevice;
140 #[cfg(feature = "pci-hotplug")]
141 use devices::ResourceCarrier;
142 use devices::StubPciDevice;
143 use devices::VirtioMmioDevice;
144 use devices::VirtioPciDevice;
145 #[cfg(feature = "usb")]
146 use devices::XhciController;
147 #[cfg(feature = "gpu")]
148 use gpu::*;
149 #[cfg(target_arch = "riscv64")]
150 use hypervisor::CpuConfigRiscv64;
151 #[cfg(target_arch = "x86_64")]
152 use hypervisor::CpuConfigX86_64;
153 use hypervisor::Hypervisor;
154 use hypervisor::HypervisorCap;
155 use hypervisor::MemCacheType;
156 use hypervisor::ProtectionType;
157 use hypervisor::Vm;
158 use hypervisor::VmCap;
159 use jail::*;
160 #[cfg(feature = "pci-hotplug")]
161 use jail_warden::JailWarden;
162 #[cfg(feature = "pci-hotplug")]
163 use jail_warden::JailWardenImpl;
164 #[cfg(feature = "pci-hotplug")]
165 use jail_warden::PermissiveJailWarden;
166 use libc;
167 use metrics::MetricsController;
168 use minijail::Minijail;
169 #[cfg(feature = "pci-hotplug")]
170 use pci_hotplug_manager::PciHotPlugManager;
171 use resources::AddressRange;
172 use resources::Alloc;
173 use resources::SystemAllocator;
174 #[cfg(target_arch = "riscv64")]
175 use riscv64::Riscv64 as Arch;
176 use rutabaga_gfx::RutabagaGralloc;
177 use rutabaga_gfx::RutabagaGrallocBackendFlags;
178 use smallvec::SmallVec;
179 #[cfg(feature = "swap")]
180 use swap::SwapController;
181 use sync::Condvar;
182 use sync::Mutex;
183 use vm_control::api::VmMemoryClient;
184 use vm_control::*;
185 use vm_memory::GuestAddress;
186 use vm_memory::GuestMemory;
187 use vm_memory::MemoryPolicy;
188 use vm_memory::MemoryRegionOptions;
189 #[cfg(target_arch = "x86_64")]
190 use x86_64::X8664arch as Arch;
191 
192 use crate::crosvm::config::Config;
193 use crate::crosvm::config::Executable;
194 use crate::crosvm::config::FileBackedMappingParameters;
195 use crate::crosvm::config::HypervisorKind;
196 use crate::crosvm::config::InputDeviceOption;
197 use crate::crosvm::config::IrqChipKind;
198 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
199 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
200 #[cfg(feature = "gdb")]
201 use crate::crosvm::gdb::gdb_thread;
202 #[cfg(feature = "gdb")]
203 use crate::crosvm::gdb::GdbStub;
204 #[cfg(target_arch = "x86_64")]
205 use crate::crosvm::ratelimit::Ratelimit;
206 use crate::crosvm::sys::cmdline::DevicesCommand;
207 use crate::crosvm::sys::config::SharedDir;
208 use crate::crosvm::sys::config::SharedDirKind;
209 
/// Path to the KVM hypervisor device node.
const KVM_PATH: &str = "/dev/kvm";
/// Path to the Geniezone hypervisor device node (ARM targets with the
/// "geniezone" feature only).
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
const GENIEZONE_PATH: &str = "/dev/gzvm";
/// Path to the Gunyah hypervisor device node (ARM targets with the
/// "gunyah" feature only). Declared `const` for consistency with the
/// other device-node path constants above.
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
const GUNYAH_PATH: &str = "/dev/gunyah";
216 
create_virtio_devices( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube, #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>, #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>, #[cfg(feature = "balloon")] init_balloon_size: u64, #[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>, disk_device_tubes: &mut Vec<Tube>, pmem_device_tubes: &mut Vec<Tube>, fs_device_tubes: &mut Vec<Tube>, #[cfg(feature = "gpu")] gpu_control_tube: Tube, #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>, #[cfg(feature = "gpu")] has_vfio_gfx_device: bool, #[cfg(feature = "registered_events")] registered_evt_q: &SendTube, #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>, ) -> DeviceResult<Vec<VirtioDeviceStub>>217 fn create_virtio_devices(
218     cfg: &Config,
219     vm: &mut impl Vm,
220     resources: &mut SystemAllocator,
221     #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
222     #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
223     #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
224     #[cfg(feature = "balloon")] init_balloon_size: u64,
225     #[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>,
226     disk_device_tubes: &mut Vec<Tube>,
227     pmem_device_tubes: &mut Vec<Tube>,
228     fs_device_tubes: &mut Vec<Tube>,
229     #[cfg(feature = "gpu")] gpu_control_tube: Tube,
230     #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
231     #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
232     #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
233     #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
234 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
235     let mut devs = Vec::new();
236 
237     #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
238     let mut resource_bridges = Vec::<Tube>::new();
239 
240     if !cfg.wayland_socket_paths.is_empty() {
241         #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
242         let mut wl_resource_bridge = None::<Tube>;
243 
244         #[cfg(feature = "gpu")]
245         {
246             if cfg.gpu_parameters.is_some() {
247                 let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
248                 resource_bridges.push(gpu_socket);
249                 wl_resource_bridge = Some(wl_socket);
250             }
251         }
252 
253         devs.push(create_wayland_device(
254             cfg.protection_type,
255             &cfg.jail_config,
256             &cfg.wayland_socket_paths,
257             wl_resource_bridge,
258         )?);
259     }
260 
261     #[cfg(feature = "video-decoder")]
262     let video_dec_cfg = cfg
263         .video_dec
264         .iter()
265         .map(|config| {
266             let (video_tube, gpu_tube) =
267                 Tube::pair().expect("failed to create tube for video decoder");
268             resource_bridges.push(gpu_tube);
269             (video_tube, config.backend)
270         })
271         .collect::<Vec<_>>();
272 
273     #[cfg(feature = "video-encoder")]
274     let video_enc_cfg = cfg
275         .video_enc
276         .iter()
277         .map(|config| {
278             let (video_tube, gpu_tube) =
279                 Tube::pair().expect("failed to create tube for video encoder");
280             resource_bridges.push(gpu_tube);
281             (video_tube, config.backend)
282         })
283         .collect::<Vec<_>>();
284 
285     #[cfg(feature = "gpu")]
286     {
287         if let Some(gpu_parameters) = &cfg.gpu_parameters {
288             let mut event_devices = Vec::new();
289             if cfg.display_window_mouse {
290                 let display_param = if gpu_parameters.display_params.is_empty() {
291                     Default::default()
292                 } else {
293                     gpu_parameters.display_params[0].clone()
294                 };
295                 let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();
296 
297                 let (event_device_socket, virtio_dev_socket) =
298                     StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
299                         .context("failed to create socket")?;
300                 let mut multi_touch_width = gpu_display_w;
301                 let mut multi_touch_height = gpu_display_h;
302                 let mut multi_touch_name = None;
303                 for input in &cfg.virtio_input {
304                     if let InputDeviceOption::MultiTouch {
305                         width,
306                         height,
307                         name,
308                         ..
309                     } = input
310                     {
311                         if let Some(width) = width {
312                             multi_touch_width = *width;
313                         }
314                         if let Some(height) = height {
315                             multi_touch_height = *height;
316                         }
317                         if let Some(name) = name {
318                             multi_touch_name = Some(name.as_str());
319                         }
320                         break;
321                     }
322                 }
323                 let dev = virtio::input::new_multi_touch(
324                     // u32::MAX is the least likely to collide with the indices generated above for
325                     // the multi_touch options, which begin at 0.
326                     u32::MAX,
327                     virtio_dev_socket,
328                     multi_touch_width,
329                     multi_touch_height,
330                     multi_touch_name,
331                     virtio::base_features(cfg.protection_type),
332                 )
333                 .context("failed to set up mouse device")?;
334                 devs.push(VirtioDeviceStub {
335                     dev: Box::new(dev),
336                     jail: simple_jail(&cfg.jail_config, "input_device")?,
337                 });
338                 event_devices.push(EventDevice::touchscreen(event_device_socket));
339             }
340             if cfg.display_window_keyboard {
341                 let (event_device_socket, virtio_dev_socket) =
342                     StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
343                         .context("failed to create socket")?;
344                 let dev = virtio::input::new_keyboard(
345                     // u32::MAX is the least likely to collide with the indices generated above for
346                     // the multi_touch options, which begin at 0.
347                     u32::MAX,
348                     virtio_dev_socket,
349                     virtio::base_features(cfg.protection_type),
350                 )
351                 .context("failed to set up keyboard device")?;
352                 devs.push(VirtioDeviceStub {
353                     dev: Box::new(dev),
354                     jail: simple_jail(&cfg.jail_config, "input_device")?,
355                 });
356                 event_devices.push(EventDevice::keyboard(event_device_socket));
357             }
358 
359             devs.push(create_gpu_device(
360                 cfg,
361                 vm_evt_wrtube,
362                 gpu_control_tube,
363                 resource_bridges,
364                 render_server_fd,
365                 has_vfio_gfx_device,
366                 event_devices,
367             )?);
368         }
369     }
370 
371     for (_, param) in cfg.serial_parameters.iter().filter(|(_k, v)| {
372         v.hardware == SerialHardware::VirtioConsole
373             || v.hardware == SerialHardware::LegacyVirtioConsole
374     }) {
375         let dev = param.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
376         devs.push(dev);
377     }
378 
379     for disk in &cfg.disks {
380         let disk_config = DiskConfig::new(disk, Some(disk_device_tubes.remove(0)));
381         devs.push(
382             disk_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
383         );
384     }
385 
386     if !cfg.scsis.is_empty() {
387         let scsi_config = ScsiConfig(&cfg.scsis);
388         devs.push(
389             scsi_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
390         );
391     }
392 
393     for (index, pmem_disk) in cfg.pmem_devices.iter().enumerate() {
394         let pmem_device_tube = pmem_device_tubes.remove(0);
395         devs.push(create_pmem_device(
396             cfg.protection_type,
397             &cfg.jail_config,
398             vm,
399             resources,
400             pmem_disk,
401             index,
402             pmem_device_tube,
403         )?);
404     }
405 
406     if cfg.rng {
407         devs.push(create_rng_device(cfg.protection_type, &cfg.jail_config)?);
408     }
409 
410     #[cfg(feature = "pvclock")]
411     if let Some(suspend_tube) = pvclock_device_tube {
412         let tsc_state = devices::tsc::tsc_state()?;
413         let tsc_sync_mitigations =
414             get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
415         if tsc_state.core_grouping.size() > 1 {
416             // Host TSCs are not in sync. Log what mitigations are applied.
417             warn!(
418                 "Host TSCs are not in sync, applying the following mitigations: {:?}",
419                 tsc_sync_mitigations
420             );
421         }
422         devs.push(create_pvclock_device(
423             cfg.protection_type,
424             &cfg.jail_config,
425             tsc_state.frequency,
426             suspend_tube,
427         )?);
428         info!("virtio-pvclock is enabled for this vm");
429     }
430 
431     #[cfg(feature = "vtpm")]
432     {
433         if cfg.vtpm_proxy {
434             devs.push(create_vtpm_proxy_device(
435                 cfg.protection_type,
436                 &cfg.jail_config,
437             )?);
438         }
439     }
440 
441     let mut keyboard_idx = 0;
442     let mut mouse_idx = 0;
443     let mut rotary_idx = 0;
444     let mut switches_idx = 0;
445     let mut multi_touch_idx = 0;
446     let mut single_touch_idx = 0;
447     let mut trackpad_idx = 0;
448     for input in &cfg.virtio_input {
449         let input_dev = match input {
450             InputDeviceOption::Evdev { path } => {
451                 create_vinput_device(cfg.protection_type, &cfg.jail_config, path.as_path())?
452             }
453             InputDeviceOption::Keyboard { path } => {
454                 let dev = create_keyboard_device(
455                     cfg.protection_type,
456                     &cfg.jail_config,
457                     path.as_path(),
458                     keyboard_idx,
459                 )?;
460                 keyboard_idx += 1;
461                 dev
462             }
463             InputDeviceOption::Mouse { path } => {
464                 let dev = create_mouse_device(
465                     cfg.protection_type,
466                     &cfg.jail_config,
467                     path.as_path(),
468                     mouse_idx,
469                 )?;
470                 mouse_idx += 1;
471                 dev
472             }
473             InputDeviceOption::MultiTouch {
474                 path,
475                 width,
476                 height,
477                 name,
478             } => {
479                 let mut width = *width;
480                 let mut height = *height;
481                 if multi_touch_idx == 0 {
482                     if width.is_none() {
483                         width = cfg.display_input_width;
484                     }
485                     if height.is_none() {
486                         height = cfg.display_input_height;
487                     }
488                 }
489                 let dev = create_multi_touch_device(
490                     cfg.protection_type,
491                     &cfg.jail_config,
492                     path.as_path(),
493                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
494                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
495                     name.as_deref(),
496                     multi_touch_idx,
497                 )?;
498                 multi_touch_idx += 1;
499                 dev
500             }
501             InputDeviceOption::Rotary { path } => {
502                 let dev = create_rotary_device(
503                     cfg.protection_type,
504                     &cfg.jail_config,
505                     path.as_path(),
506                     rotary_idx,
507                 )?;
508                 rotary_idx += 1;
509                 dev
510             }
511             InputDeviceOption::SingleTouch {
512                 path,
513                 width,
514                 height,
515                 name,
516             } => {
517                 let mut width = *width;
518                 let mut height = *height;
519                 if single_touch_idx == 0 {
520                     if width.is_none() {
521                         width = cfg.display_input_width;
522                     }
523                     if height.is_none() {
524                         height = cfg.display_input_height;
525                     }
526                 }
527                 let dev = create_single_touch_device(
528                     cfg.protection_type,
529                     &cfg.jail_config,
530                     path.as_path(),
531                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
532                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
533                     name.as_deref(),
534                     single_touch_idx,
535                 )?;
536                 single_touch_idx += 1;
537                 dev
538             }
539             InputDeviceOption::Switches { path } => {
540                 let dev = create_switches_device(
541                     cfg.protection_type,
542                     &cfg.jail_config,
543                     path.as_path(),
544                     switches_idx,
545                 )?;
546                 switches_idx += 1;
547                 dev
548             }
549             InputDeviceOption::Trackpad {
550                 path,
551                 width,
552                 height,
553                 name,
554             } => {
555                 let dev = create_trackpad_device(
556                     cfg.protection_type,
557                     &cfg.jail_config,
558                     path.as_path(),
559                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
560                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
561                     name.as_deref(),
562                     trackpad_idx,
563                 )?;
564                 trackpad_idx += 1;
565                 dev
566             }
567         };
568         devs.push(input_dev);
569     }
570 
571     #[cfg(feature = "balloon")]
572     if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
573         (balloon_device_tube, dynamic_mapping_device_tube)
574     {
575         let balloon_features = (cfg.balloon_page_reporting as u64)
576             << BalloonFeatures::PageReporting as u64
577             | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;
578         devs.push(create_balloon_device(
579             cfg.protection_type,
580             &cfg.jail_config,
581             if cfg.strict_balloon {
582                 BalloonMode::Strict
583             } else {
584                 BalloonMode::Relaxed
585             },
586             balloon_device_tube,
587             balloon_inflate_tube,
588             init_balloon_size,
589             dynamic_mapping_device_tube,
590             balloon_features,
591             #[cfg(feature = "registered_events")]
592             Some(
593                 registered_evt_q
594                     .try_clone()
595                     .context("failed to clone registered_evt_q tube")?,
596             ),
597             cfg.balloon_ws_num_bins,
598         )?);
599     }
600 
601     #[cfg(feature = "net")]
602     for opt in &cfg.net {
603         let dev = opt.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
604         devs.push(dev);
605     }
606 
607     #[cfg(feature = "audio")]
608     {
609         for virtio_snd in &cfg.virtio_snds {
610             devs.push(create_virtio_snd_device(
611                 cfg.protection_type,
612                 &cfg.jail_config,
613                 virtio_snd.clone(),
614             )?);
615         }
616     }
617 
618     #[cfg(feature = "video-decoder")]
619     {
620         for (tube, backend) in video_dec_cfg {
621             register_video_device(
622                 backend,
623                 &mut devs,
624                 tube,
625                 cfg.protection_type,
626                 &cfg.jail_config,
627                 VideoDeviceType::Decoder,
628             )?;
629         }
630     }
631 
632     #[cfg(feature = "video-encoder")]
633     {
634         for (tube, backend) in video_enc_cfg {
635             register_video_device(
636                 backend,
637                 &mut devs,
638                 tube,
639                 cfg.protection_type,
640                 &cfg.jail_config,
641                 VideoDeviceType::Encoder,
642             )?;
643         }
644     }
645 
646     if let Some(vsock_config) = &cfg.vsock {
647         devs.push(
648             vsock_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
649         );
650     }
651 
652     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
653     {
654         if cfg.vhost_scmi {
655             devs.push(create_vhost_scmi_device(
656                 cfg.protection_type,
657                 &cfg.jail_config,
658                 cfg.vhost_scmi_device.clone(),
659             )?);
660         }
661     }
662     for vhost_user_fs in &cfg.vhost_user_fs {
663         devs.push(create_vhost_user_fs_device(
664             cfg.protection_type,
665             vhost_user_fs,
666         )?);
667     }
668 
669     for shared_dir in &cfg.shared_dirs {
670         let SharedDir {
671             src,
672             tag,
673             kind,
674             ugid,
675             uid_map,
676             gid_map,
677             fs_cfg,
678             p9_cfg,
679         } = shared_dir;
680 
681         let dev = match kind {
682             SharedDirKind::FS => {
683                 let device_tube = fs_device_tubes.remove(0);
684                 create_fs_device(
685                     cfg.protection_type,
686                     &cfg.jail_config,
687                     *ugid,
688                     uid_map,
689                     gid_map,
690                     src,
691                     tag,
692                     fs_cfg.clone(),
693                     device_tube,
694                 )?
695             }
696             SharedDirKind::P9 => create_9p_device(
697                 cfg.protection_type,
698                 &cfg.jail_config,
699                 *ugid,
700                 uid_map,
701                 gid_map,
702                 src,
703                 tag,
704                 p9_cfg.clone(),
705             )?,
706         };
707         devs.push(dev);
708     }
709 
710     #[cfg(feature = "audio")]
711     if let Some(path) = &cfg.sound {
712         devs.push(create_sound_device(
713             path,
714             cfg.protection_type,
715             &cfg.jail_config,
716         )?);
717     }
718 
719     for opt in &cfg.vhost_user {
720         devs.push(create_vhost_user_frontend(cfg.protection_type, opt)?);
721     }
722 
723     Ok(devs)
724 }
725 
create_devices( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, vm_evt_wrtube: &SendTube, iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>, irq_control_tubes: &mut Vec<Tube>, vm_memory_control_tubes: &mut Vec<VmMemoryTube>, control_tubes: &mut Vec<TaggedControlTube>, #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>, #[cfg(feature = "balloon")] init_balloon_size: u64, #[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>, disk_device_tubes: &mut Vec<Tube>, pmem_device_tubes: &mut Vec<Tube>, fs_device_tubes: &mut Vec<Tube>, #[cfg(feature = "usb")] usb_provider: DeviceProvider, #[cfg(feature = "gpu")] gpu_control_tube: Tube, #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>, iova_max_addr: &mut Option<u64>, #[cfg(feature = "registered_events")] registered_evt_q: &SendTube, #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>, ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>>726 fn create_devices(
727     cfg: &Config,
728     vm: &mut impl Vm,
729     resources: &mut SystemAllocator,
730     vm_evt_wrtube: &SendTube,
731     iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
732     irq_control_tubes: &mut Vec<Tube>,
733     vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
734     control_tubes: &mut Vec<TaggedControlTube>,
735     #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
736     #[cfg(feature = "balloon")] init_balloon_size: u64,
737     #[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>,
738     disk_device_tubes: &mut Vec<Tube>,
739     pmem_device_tubes: &mut Vec<Tube>,
740     fs_device_tubes: &mut Vec<Tube>,
741     #[cfg(feature = "usb")] usb_provider: DeviceProvider,
742     #[cfg(feature = "gpu")] gpu_control_tube: Tube,
743     #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
744     iova_max_addr: &mut Option<u64>,
745     #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
746     #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
747 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
748     let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
749     #[cfg(feature = "balloon")]
750     let mut balloon_inflate_tube: Option<Tube> = None;
751     #[cfg(feature = "gpu")]
752     let mut has_vfio_gfx_device = false;
753     if !cfg.vfio.is_empty() {
754         let mut coiommu_attached_endpoints = Vec::new();
755 
756         for vfio_dev in &cfg.vfio {
757             let (dev, jail, viommu_mapper) = create_vfio_device(
758                 &cfg.jail_config,
759                 vm,
760                 resources,
761                 irq_control_tubes,
762                 vm_memory_control_tubes,
763                 control_tubes,
764                 &vfio_dev.path,
765                 false,
766                 None,
767                 vfio_dev.guest_address,
768                 Some(&mut coiommu_attached_endpoints),
769                 vfio_dev.iommu,
770                 vfio_dev.dt_symbol.clone(),
771             )?;
772             match dev {
773                 VfioDeviceVariant::Pci(vfio_pci_device) => {
774                     *iova_max_addr = Some(max(
775                         vfio_pci_device.get_max_iova(),
776                         iova_max_addr.unwrap_or(0),
777                     ));
778 
779                     #[cfg(feature = "gpu")]
780                     if vfio_pci_device.is_gfx() {
781                         has_vfio_gfx_device = true;
782                     }
783 
784                     if let Some(viommu_mapper) = viommu_mapper {
785                         iommu_attached_endpoints.insert(
786                             vfio_pci_device
787                                 .pci_address()
788                                 .context("not initialized")?
789                                 .to_u32(),
790                             Arc::new(Mutex::new(Box::new(viommu_mapper))),
791                         );
792                     }
793 
794                     devices.push((Box::new(vfio_pci_device), jail));
795                 }
796                 VfioDeviceVariant::Platform(vfio_plat_dev) => {
797                     devices.push((Box::new(vfio_plat_dev), jail));
798                 }
799             }
800         }
801 
802         if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
803             let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
804             // SAFETY: trivially safe
805             let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
806             if res == 0 {
807                 // SAFETY: safe because getrlimit64 has returned success.
808                 let limit = unsafe { buf.assume_init() };
809                 let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
810                 let rlim_max = max(limit.rlim_max, rlim_new);
811                 if limit.rlim_cur < rlim_new {
812                     let limit_arg = libc::rlimit64 {
813                         rlim_cur: rlim_new,
814                         rlim_max,
815                     };
816                     // SAFETY: trivially safe
817                     let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
818                     if res != 0 {
819                         bail!("Set rlimit failed");
820                     }
821                 }
822             } else {
823                 bail!("Get rlimit failed");
824             }
825         }
826         #[cfg(feature = "balloon")]
827         let coiommu_tube: Option<Tube>;
828         #[cfg(not(feature = "balloon"))]
829         let coiommu_tube: Option<Tube> = None;
830         if !coiommu_attached_endpoints.is_empty() {
831             let vfio_container =
832                 VfioCommonSetup::vfio_get_container(IommuDevType::CoIommu, None as Option<&Path>)
833                     .context("failed to get vfio container")?;
834             let (coiommu_host_tube, coiommu_device_tube) =
835                 Tube::pair().context("failed to create coiommu tube")?;
836             vm_memory_control_tubes.push(VmMemoryTube {
837                 tube: coiommu_host_tube,
838                 expose_with_viommu: false,
839             });
840             let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
841             #[cfg(feature = "balloon")]
842             match Tube::pair() {
843                 Ok((x, y)) => {
844                     coiommu_tube = Some(x);
845                     balloon_inflate_tube = Some(y);
846                 }
847                 Err(x) => return Err(x).context("failed to create coiommu tube"),
848             }
849             let dev = CoIommuDev::new(
850                 vm.get_memory().clone(),
851                 vfio_container,
852                 VmMemoryClient::new(coiommu_device_tube),
853                 coiommu_tube,
854                 coiommu_attached_endpoints,
855                 vcpu_count,
856                 cfg.coiommu_param.unwrap_or_default(),
857             )
858             .context("failed to create coiommu device")?;
859 
860             devices.push((
861                 Box::new(dev),
862                 simple_jail(&cfg.jail_config, "coiommu_device")?,
863             ));
864         }
865     }
866 
867     let stubs = create_virtio_devices(
868         cfg,
869         vm,
870         resources,
871         vm_evt_wrtube,
872         #[cfg(feature = "balloon")]
873         balloon_device_tube,
874         #[cfg(feature = "balloon")]
875         balloon_inflate_tube,
876         #[cfg(feature = "balloon")]
877         init_balloon_size,
878         #[cfg(feature = "balloon")]
879         dynamic_mapping_device_tube,
880         disk_device_tubes,
881         pmem_device_tubes,
882         fs_device_tubes,
883         #[cfg(feature = "gpu")]
884         gpu_control_tube,
885         #[cfg(feature = "gpu")]
886         render_server_fd,
887         #[cfg(feature = "gpu")]
888         has_vfio_gfx_device,
889         #[cfg(feature = "registered_events")]
890         registered_evt_q,
891         #[cfg(feature = "pvclock")]
892         pvclock_device_tube,
893     )?;
894 
895     for stub in stubs {
896         match stub.dev.transport_type() {
897             VirtioTransportType::Pci => {
898                 let (msi_host_tube, msi_device_tube) =
899                     Tube::pair().context("failed to create tube")?;
900                 irq_control_tubes.push(msi_host_tube);
901 
902                 let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
903                     let (host_tube, device_tube) =
904                         Tube::pair().context("failed to create shared memory tube")?;
905                     vm_memory_control_tubes.push(VmMemoryTube {
906                         tube: host_tube,
907                         expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
908                     });
909                     Some(device_tube)
910                 } else {
911                     None
912                 };
913 
914                 let (ioevent_host_tube, ioevent_device_tube) =
915                     Tube::pair().context("failed to create ioevent tube")?;
916                 vm_memory_control_tubes.push(VmMemoryTube {
917                     tube: ioevent_host_tube,
918                     expose_with_viommu: false,
919                 });
920 
921                 let (host_tube, device_tube) =
922                     Tube::pair().context("failed to create device control tube")?;
923                 control_tubes.push(TaggedControlTube::Vm(host_tube));
924 
925                 let dev = VirtioPciDevice::new(
926                     vm.get_memory().clone(),
927                     stub.dev,
928                     msi_device_tube,
929                     cfg.disable_virtio_intx,
930                     shared_memory_tube.map(VmMemoryClient::new),
931                     VmMemoryClient::new(ioevent_device_tube),
932                     device_tube,
933                 )
934                 .context("failed to create virtio pci dev")?;
935 
936                 devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
937             }
938             VirtioTransportType::Mmio => {
939                 let dev = VirtioMmioDevice::new(vm.get_memory().clone(), stub.dev, false)
940                     .context("failed to create virtio mmio dev")?;
941                 devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
942             }
943         }
944     }
945 
946     #[cfg(feature = "usb")]
947     if cfg.usb {
948         // Create xhci controller.
949         let usb_controller = Box::new(XhciController::new(
950             vm.get_memory().clone(),
951             Box::new(usb_provider),
952         ));
953         devices.push((
954             usb_controller,
955             simple_jail(&cfg.jail_config, "xhci_device")?,
956         ));
957     }
958 
959     for params in &cfg.stub_pci_devices {
960         // Stub devices don't need jailing since they don't do anything.
961         devices.push((Box::new(StubPciDevice::new(params)), None));
962     }
963 
964     devices.push((
965         Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
966         None,
967     ));
968 
969     Ok(devices)
970 }
971 
create_file_backed_mappings( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, ) -> Result<()>972 fn create_file_backed_mappings(
973     cfg: &Config,
974     vm: &mut impl Vm,
975     resources: &mut SystemAllocator,
976 ) -> Result<()> {
977     for mapping in &cfg.file_backed_mappings {
978         let file = OpenOptions::new()
979             .read(true)
980             .write(mapping.writable)
981             .custom_flags(if mapping.sync { libc::O_SYNC } else { 0 })
982             .open(&mapping.path)
983             .context("failed to open file for file-backed mapping")?;
984         let prot = if mapping.writable {
985             Protection::read_write()
986         } else {
987             Protection::read()
988         };
989         let size = mapping
990             .size
991             .try_into()
992             .context("Invalid size for file-backed mapping")?;
993         let memory_mapping = MemoryMappingBuilder::new(size)
994             .from_file(&file)
995             .offset(mapping.offset)
996             .protection(prot)
997             .build()
998             .context("failed to map backing file for file-backed mapping")?;
999 
1000         let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
1001             .context("failed to convert to AddressRange")?;
1002         match resources.mmio_allocator_any().allocate_at(
1003             mapping_range,
1004             Alloc::FileBacked(mapping.address),
1005             "file-backed mapping".to_owned(),
1006         ) {
1007             // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
1008             // consider it an error.
1009             // TODO(b/222769529): Reserve this region in a global memory address space allocator
1010             // once we have that so nothing else can accidentally overlap with it.
1011             Ok(()) | Err(resources::Error::OutOfSpace) => {}
1012             e => e.context("failed to allocate guest address for file-backed mapping")?,
1013         }
1014 
1015         vm.add_memory_region(
1016             GuestAddress(mapping.address),
1017             Box::new(memory_mapping),
1018             !mapping.writable,
1019             /* log_dirty_pages = */ false,
1020             MemCacheType::CacheCoherent,
1021         )
1022         .context("failed to configure file-backed mapping")?;
1023     }
1024 
1025     Ok(())
1026 }
1027 
#[cfg(target_arch = "x86_64")]
/// Collection of devices related to PCI hotplug.
struct HotPlugStub {
    /// Map from bus index to hotplug bus.
    hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    /// Bus ranges of devices for virtio-iommu.
    iommu_bus_ranges: Vec<RangeInclusive<u32>>,
    /// Map from GPE index to GpeNotify devices.
    gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
    /// Map from bus index to PmeNotify devices.
    pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
}
1040 
1041 #[cfg(target_arch = "x86_64")]
1042 impl HotPlugStub {
1043     /// Constructs empty HotPlugStub.
new() -> Self1044     fn new() -> Self {
1045         Self {
1046             hotplug_buses: BTreeMap::new(),
1047             iommu_bus_ranges: Vec::new(),
1048             gpe_notify_devs: BTreeMap::new(),
1049             pme_notify_devs: BTreeMap::new(),
1050         }
1051     }
1052 }
1053 
#[cfg(target_arch = "x86_64")]
/// Creates PCIE root ports backed only by virtual devices.
///
/// The user doesn't specify host pcie root ports that these virtual root ports link to; instead,
/// empty PCI buses are discovered and fully virtual pcie root ports are created on them. The
/// first `hp_bus_count` empty buses become hotplug-capable; an error is returned if fewer than
/// `hp_bus_count` empty buses exist.
fn create_pure_virtual_pcie_root_port(
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
    hp_bus_count: u8,
) -> Result<HotPlugStub> {
    let mut hp_sec_buses = Vec::new();
    let mut hp_stub = HotPlugStub::new();
    // Create Pcie Root Port for non-root buses, each non-root bus device will be
    // connected behind a virtual pcie root port.
    for i in 1..255 {
        if sys_allocator.pci_bus_empty(i) {
            // Remember the first `hp_bus_count` empty buses; they become the hotplug-capable
            // root ports created below.
            if hp_sec_buses.len() < hp_bus_count.into() {
                hp_sec_buses.push(i);
            }
            continue;
        }
        let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
        hp_stub
            .pme_notify_devs
            .insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        irq_control_tubes.push(msi_host_tube);
        let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
        // no ipc is used if the root port disables hotplug
        devices.push((pci_bridge, None));
    }

    // Create Pcie Root Port for hot-plug
    if hp_sec_buses.len() < hp_bus_count.into() {
        return Err(anyhow!("no more addresses are available"));
    }

    for hp_sec_bus in hp_sec_buses {
        let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
        hp_stub.pme_notify_devs.insert(
            hp_sec_bus,
            pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
        );
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        irq_control_tubes.push(msi_host_tube);
        let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));

        // Record the secondary..=subordinate bus range of this bridge for virtio-iommu.
        // NOTE(review): dev=32/func=8 are one past the last valid PCI device/function numbers
        // (0..=31 / 0..=7) — presumably to make the inclusive range end cover the whole
        // subordinate bus; verify against PciAddress::to_u32's encoding.
        hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
            PciAddress {
                bus: pci_bridge.get_secondary_num(),
                dev: 0,
                func: 0,
            }
            .to_u32(),
            PciAddress {
                bus: pci_bridge.get_subordinate_num(),
                dev: 32,
                func: 8,
            }
            .to_u32(),
        ));

        devices.push((pci_bridge, None));
        hp_stub
            .hotplug_buses
            .insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
    }
    Ok(hp_stub)
}
1124 
/// Builds the `VmComponents` consumed by the architecture-specific VM setup from the parsed
/// command-line `Config`.
///
/// Opens every image file referenced by the config (kernel or BIOS, initrd, pvmfw, pflash),
/// derives memory sizes (guest RAM, swiotlb), and resolves CPU topology/frequency settings.
/// Returns an error if any file fails to open or a requested size overflows.
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
        Some(
            open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
        )
    } else {
        None
    };
    let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
        Some(
            open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
        )
    } else {
        None
    };

    // Exactly one of a kernel or BIOS image must have been supplied; upstream config validation
    // is expected to guarantee this, hence the panic on the fallthrough arm.
    let vm_image = match cfg.executable_path {
        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
            open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
                || format!("failed to open kernel image {}", kernel_path.display()),
            )?,
        ),
        Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
            open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
        ),
        _ => panic!("Did not receive a bios or kernel, should be impossible."),
    };

    // swiotlb size in bytes: the explicit value (given in MiB) if set; otherwise protected VMs
    // get a 64 MiB default and unprotected VMs get none.
    let swiotlb = if let Some(size) = cfg.swiotlb {
        Some(
            size.checked_mul(1024 * 1024)
                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
        )
    } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
        None
    } else {
        Some(64 * 1024 * 1024)
    };

    // pflash is opened read-write since the guest may program it.
    let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
    {
        (
            Some(
                open_file_or_duplicate(
                    &pflash_parameters.path,
                    OpenOptions::new().read(true).write(true),
                )
                .with_context(|| {
                    format!("failed to open pflash {}", pflash_parameters.path.display())
                })?,
            ),
            pflash_parameters.block_size,
        )
    } else {
        (None, 0)
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut cpu_frequencies = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut virt_cpufreq_socket = None;

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    if cfg.virt_cpufreq {
        let host_cpu_frequencies = Arch::get_host_cpu_frequencies_khz()?;

        for cpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
            let vcpu_affinity = match cfg.vcpu_affinity.clone() {
                Some(VcpuAffinity::Global(v)) => v,
                Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
                None => {
                    panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
                }
            };

            // Check that the physical CPUs that the vCPU is affined to all share the same
            // frequency domain.
            if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
                for cpu in vcpu_affinity.iter() {
                    if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
                        if frequencies != freq_domain {
                            panic!("Affined CPUs do not share a frequency domain!");
                        }
                    }
                }
                cpu_frequencies.insert(cpu_id, freq_domain.clone());
            } else {
                panic!("No frequency domain for cpu:{}", cpu_id);
            }
        }

        virt_cpufreq_socket = if let Some(path) = &cfg.virt_cpufreq_socket {
            let file = base::open_file_or_duplicate(path, OpenOptions::new().write(true))
                .with_context(|| {
                    format!("failed to open virt_cpufreq_socket {}", path.display())
                })?;
            // Convert the opened file into a connected UnixStream via its owned fd.
            let fd: std::os::fd::OwnedFd = file.into();
            let socket: std::os::unix::net::UnixStream = fd.into();
            Some(socket)
        } else {
            None
        };
    }

    // if --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg
    let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
    // Clusters/capacity either mirror the host topology or come straight from the config.
    let (cpu_clusters, cpu_capacity) = if cfg.host_cpu_topology {
        (
            Arch::get_host_cpu_clusters()?,
            Arch::get_host_cpu_capacity()?,
        )
    } else {
        (cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
    };

    Ok(VmComponents {
        #[cfg(target_arch = "x86_64")]
        ac_adapter: cfg.ac_adapter,
        #[cfg(target_arch = "x86_64")]
        break_linux_pci_config_io: cfg.break_linux_pci_config_io,
        // Guest RAM size in bytes; `cfg.memory` is specified in MiB and defaults to 256 MiB.
        memory_size: cfg
            .memory
            .unwrap_or(256)
            .checked_mul(1024 * 1024)
            .ok_or_else(|| anyhow!("requested memory size too large"))?,
        swiotlb,
        fw_cfg_enable,
        bootorder_fw_cfg_blob: Vec::new(),
        vcpu_count: cfg.vcpu_count.unwrap_or(1),
        vcpu_affinity: cfg.vcpu_affinity.clone(),
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        cpu_frequencies,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        virt_cpufreq_socket,
        fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
        cpu_clusters,
        cpu_capacity,
        no_smt: cfg.no_smt,
        hugepages: cfg.hugepages,
        hv_cfg: hypervisor::Config {
            #[cfg(target_arch = "aarch64")]
            mte: cfg.mte,
            protection_type: cfg.protection_type,
        },
        vm_image,
        android_fstab: cfg
            .android_fstab
            .as_ref()
            .map(|x| {
                File::open(x)
                    .with_context(|| format!("failed to open android fstab file {}", x.display()))
            })
            .map_or(Ok(None), |v| v.map(Some))?,
        pstore: cfg.pstore.clone(),
        pflash_block_size,
        pflash_image,
        initrd_image,
        extra_kernel_params: cfg.params.clone(),
        acpi_sdts: cfg
            .acpi_tables
            .iter()
            .map(|path| {
                SDT::from_file(path)
                    .with_context(|| format!("failed to open ACPI file {}", path.display()))
            })
            .collect::<Result<Vec<SDT>>>()?,
        rt_cpus: cfg.rt_cpus.clone(),
        delay_rt: cfg.delay_rt,
        #[cfg(feature = "gdb")]
        gdb: None,
        no_i8042: cfg.no_i8042,
        no_rtc: cfg.no_rtc,
        #[cfg(target_arch = "x86_64")]
        smbios: cfg.smbios.clone(),
        host_cpu_topology: cfg.host_cpu_topology,
        itmt: cfg.itmt,
        #[cfg(target_arch = "x86_64")]
        force_s2idle: cfg.force_s2idle,
        pvm_fw: pvm_fw_image,
        #[cfg(target_arch = "x86_64")]
        pcie_ecam: cfg.pcie_ecam,
        #[cfg(target_arch = "x86_64")]
        pci_low_start: cfg.pci_low_start,
        dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
        boot_cpu: cfg.boot_cpu,
    })
}
1315 
/// Final outcome of a VM run.
///
/// NOTE(review): variant semantics are inferred from the names — the values are produced by the
/// VM run/control loop elsewhere in this crate; confirm each variant's trigger there.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
    Reset,
    Stop,
    Crash,
    GuestPanic,
    WatchdogReset,
}
1324 // Remove ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
1325 // Returns the updated guest memory layout.
punch_holes_in_guest_mem_layout_for_mappings( guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>, file_backed_mappings: &[FileBackedMappingParameters], ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>1326 fn punch_holes_in_guest_mem_layout_for_mappings(
1327     guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
1328     file_backed_mappings: &[FileBackedMappingParameters],
1329 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
1330     // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
1331     // at end is not included in the range).
1332     let mut layout_set = BTreeSet::new();
1333     for (addr, size, options) in &guest_mem_layout {
1334         layout_set.insert((addr.offset(), addr.offset() + size, *options));
1335     }
1336 
1337     for mapping in file_backed_mappings {
1338         let mapping_start = mapping.address;
1339         let mapping_end = mapping_start + mapping.size;
1340 
1341         // Repeatedly split overlapping guest memory regions until no overlaps remain.
1342         while let Some((range_start, range_end, options)) = layout_set
1343             .iter()
1344             .find(|&&(range_start, range_end, _)| {
1345                 mapping_start < range_end && mapping_end > range_start
1346             })
1347             .cloned()
1348         {
1349             layout_set.remove(&(range_start, range_end, options));
1350 
1351             if range_start < mapping_start {
1352                 layout_set.insert((range_start, mapping_start, options));
1353             }
1354             if range_end > mapping_end {
1355                 layout_set.insert((mapping_end, range_end, options));
1356             }
1357         }
1358     }
1359 
1360     // Build the final guest memory layout from the modified layout_set.
1361     layout_set
1362         .iter()
1363         .map(|(start, end, options)| (GuestAddress(*start), end - start, *options))
1364         .collect()
1365 }
1366 
create_guest_memory( cfg: &Config, components: &VmComponents, hypervisor: &impl Hypervisor, ) -> Result<GuestMemory>1367 fn create_guest_memory(
1368     cfg: &Config,
1369     components: &VmComponents,
1370     hypervisor: &impl Hypervisor,
1371 ) -> Result<GuestMemory> {
1372     let guest_mem_layout = Arch::guest_memory_layout(components, hypervisor)
1373         .context("failed to create guest memory layout")?;
1374 
1375     let guest_mem_layout =
1376         punch_holes_in_guest_mem_layout_for_mappings(guest_mem_layout, &cfg.file_backed_mappings);
1377 
1378     let guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1379         .context("failed to create guest memory")?;
1380     let mut mem_policy = MemoryPolicy::empty();
1381     if components.hugepages {
1382         mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1383     }
1384 
1385     if cfg.lock_guest_memory {
1386         mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1387     }
1388     guest_mem.set_memory_policy(mem_policy);
1389 
1390     if cfg.unmap_guest_memory_on_fork {
1391         // Note that this isn't compatible with sandboxing. We could potentially fix that by
1392         // delaying the call until after the sandboxed devices are forked. However, the main use
1393         // for this is in conjunction with protected VMs, where most of the guest memory has been
1394         // unshared with the host. We'd need to be confident that the guest memory is unshared with
1395         // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1396         // So, for now we keep things simple to be safe.
1397         guest_mem.use_dontfork().context("use_dontfork failed")?;
1398     }
1399 
1400     Ok(guest_mem)
1401 }
1402 
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
/// Runs the VM on the GenieZone hypervisor.
///
/// Opens the GenieZone device (`device_path` or the default `GENIEZONE_PATH`), builds guest
/// memory, optionally launches the vmm-swap monitor, creates the VM and its in-kernel IRQ chip
/// (the only mode GenieZone supports), then hands off to the generic `run_vm` loop.
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::GeniezoneKernelIrqChip;
    use hypervisor::geniezone::Geniezone;
    use hypervisor::geniezone::GeniezoneVcpu;
    use hypervisor::geniezone::GeniezoneVm;

    let device_path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
    let gzvm = Geniezone::new_with_path(device_path)
        .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &gzvm)?;

    // The swap monitor must be launched before the VM is created (it clones guest_mem).
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm =
        GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // GenieZone only supports an in-kernel irqchip, so there is never a userspace IOAPIC tube.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
        IrqChipKind::Kernel => {
            ioapic_host_tube = None;
            GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
                .context("failed to create IRQ chip")?
        }
    };

    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1456 
/// Runs the VM on the KVM hypervisor.
///
/// Opens the KVM device (`device_path` or the default `KVM_PATH`), builds guest memory,
/// optionally launches the vmm-swap monitor, creates the VM and the configured IRQ chip flavor
/// (kernel or, on x86_64, split), then hands off to the generic `run_vm` loop.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::KvmKernelIrqChip;
    #[cfg(target_arch = "x86_64")]
    use devices::KvmSplitIrqChip;
    use hypervisor::kvm::Kvm;
    use hypervisor::kvm::KvmVcpu;
    use hypervisor::kvm::KvmVm;

    let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &kvm)?;

    // The swap monitor must be launched before the VM is created (it clones guest_mem).
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    #[cfg(target_arch = "x86_64")]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Wrapper so both irqchip flavors can be produced by the match below and then borrowed
    // uniformly as a `&mut dyn IrqChipArch` for run_vm.
    enum KvmIrqChip {
        #[cfg(target_arch = "x86_64")]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        // Borrows whichever variant is held as the arch-level IrqChip trait object.
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(target_arch = "x86_64")]
                KvmIrqChip::Split(i) => i,
                KvmIrqChip::Kernel(i) => i,
            }
        }
    }

    // Split mode routes the IOAPIC through userspace, so only that arm produces a host tube.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Userspace => {
            bail!("KVM userspace irqchip mode not implemented");
        }
        IrqChipKind::Split => {
            #[cfg(not(target_arch = "x86_64"))]
            bail!("KVM split irqchip mode only supported on x86 processors");
            #[cfg(target_arch = "x86_64")]
            {
                let (host_tube, ioapic_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                ioapic_host_tube = Some(host_tube);
                KvmIrqChip::Split(
                    KvmSplitIrqChip::new(
                        vm_clone,
                        components.vcpu_count,
                        ioapic_device_tube,
                        Some(24),
                    )
                    .context("failed to create IRQ chip")?,
                )
            }
        }
        IrqChipKind::Kernel => {
            ioapic_host_tube = None;
            KvmIrqChip::Kernel(
                KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
                    .context("failed to create IRQ chip")?,
            )
        }
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1554 
/// Runs a VM on the Gunyah hypervisor.
///
/// * `device_path` - Path to the Gunyah device node; defaults to `GUNYAH_PATH` when `None`.
/// * `cfg` - Full crosvm configuration for this VM.
/// * `components` - Pre-computed VM components consumed by `run_vm`.
///
/// Returns the guest's exit state, or an error if any setup stage fails.
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
fn run_gunyah(
    device_path: Option<&Path>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::Gunyah;
    use hypervisor::gunyah::GunyahVcpu;
    use hypervisor::gunyah::GunyahVm;

    let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(device_path)
        .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &gunyah)?;

    // The swap monitor must be launched before the main process is jailed/forked further,
    // mirroring the other hypervisor paths.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = GunyahVm::new(&gunyah, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    // Attach context here for consistency with the KVM path's error reporting.
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        vm,
        &mut GunyahIrqChip::new(vm_clone).context("failed to create IRQ chip")?,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1601 
1602 /// Choose a default hypervisor if no `--hypervisor` option was specified.
get_default_hypervisor() -> Option<HypervisorKind>1603 fn get_default_hypervisor() -> Option<HypervisorKind> {
1604     let kvm_path = Path::new(KVM_PATH);
1605     if kvm_path.exists() {
1606         return Some(HypervisorKind::Kvm {
1607             device: Some(kvm_path.to_path_buf()),
1608         });
1609     }
1610 
1611     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1612     #[cfg(feature = "geniezone")]
1613     {
1614         let gz_path = Path::new(GENIEZONE_PATH);
1615         if gz_path.exists() {
1616             return Some(HypervisorKind::Geniezone {
1617                 device: Some(gz_path.to_path_buf()),
1618             });
1619         }
1620     }
1621 
1622     #[cfg(all(
1623         unix,
1624         any(target_arch = "arm", target_arch = "aarch64"),
1625         feature = "gunyah"
1626     ))]
1627     {
1628         let gunyah_path = Path::new(GUNYAH_PATH);
1629         if gunyah_path.exists() {
1630             return Some(HypervisorKind::Gunyah {
1631                 device: Some(gunyah_path.to_path_buf()),
1632             });
1633         }
1634     }
1635 
1636     None
1637 }
1638 
run_config(cfg: Config) -> Result<ExitState>1639 pub fn run_config(cfg: Config) -> Result<ExitState> {
1640     if let Some(async_executor) = cfg.async_executor {
1641         Executor::set_default_executor_kind(async_executor)
1642             .context("Failed to set the default async executor")?;
1643     }
1644 
1645     let components = setup_vm_components(&cfg)?;
1646 
1647     let hypervisor = cfg
1648         .hypervisor
1649         .clone()
1650         .or_else(get_default_hypervisor)
1651         .context("no enabled hypervisor")?;
1652 
1653     debug!("creating hypervisor: {:?}", hypervisor);
1654 
1655     match hypervisor {
1656         HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
1657         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1658         #[cfg(feature = "geniezone")]
1659         HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
1660         #[cfg(all(
1661             unix,
1662             any(target_arch = "arm", target_arch = "aarch64"),
1663             feature = "gunyah"
1664         ))]
1665         HypervisorKind::Gunyah { device } => run_gunyah(device.as_deref(), cfg, components),
1666     }
1667 }
1668 
run_vm<Vcpu, V>( cfg: Config, #[allow(unused_mut)] mut components: VmComponents, mut vm: V, irq_chip: &mut dyn IrqChipArch, ioapic_host_tube: Option<Tube>, #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, ) -> Result<ExitState> where Vcpu: VcpuArch + 'static, V: VmArch + 'static,1669 fn run_vm<Vcpu, V>(
1670     cfg: Config,
1671     #[allow(unused_mut)] mut components: VmComponents,
1672     mut vm: V,
1673     irq_chip: &mut dyn IrqChipArch,
1674     ioapic_host_tube: Option<Tube>,
1675     #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
1676 ) -> Result<ExitState>
1677 where
1678     Vcpu: VcpuArch + 'static,
1679     V: VmArch + 'static,
1680 {
1681     if cfg.jail_config.is_some() {
1682         // Printing something to the syslog before entering minijail so that libc's syslogger has a
1683         // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
1684         // access to those files will not be possible.
1685         info!("crosvm entering multiprocess mode");
1686     }
1687 
1688     let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
1689     metrics::initialize(metrics_send);
1690 
1691     #[cfg(all(feature = "pci-hotplug", feature = "swap"))]
1692     let swap_device_helper = match &swap_controller {
1693         Some(swap_controller) => Some(swap_controller.create_device_helper()?),
1694         None => None,
1695     };
1696     // pci-hotplug is only implemented for x86_64 for now, attempting to use it on other platform
1697     // would crash.
1698     #[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
1699     if cfg.pci_hotplug_slots.is_some() {
1700         bail!("pci-hotplug is not implemented for non x86_64 architecture");
1701     }
1702     // hotplug_manager must be created before vm is started since it forks jail warden process.
1703     #[cfg(feature = "pci-hotplug")]
1704     // TODO(293801301): Remove unused_mut after aarch64 support
1705     #[allow(unused_mut)]
1706     let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
1707         Some(PciHotPlugManager::new(
1708             vm.get_memory().clone(),
1709             &cfg,
1710             #[cfg(feature = "swap")]
1711             swap_device_helper,
1712         )?)
1713     } else {
1714         None
1715     };
1716 
1717     #[cfg(feature = "gpu")]
1718     let (gpu_control_host_tube, gpu_control_device_tube) =
1719         Tube::pair().context("failed to create gpu tube")?;
1720 
1721     #[cfg(feature = "usb")]
1722     let (usb_control_tube, usb_provider) =
1723         DeviceProvider::new().context("failed to create usb provider")?;
1724 
1725     // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
1726     // before any jailed devices have been spawned, so that we can catch any of them that fail very
1727     // quickly.
1728     let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
1729 
1730     let control_server_socket = match &cfg.socket_path {
1731         Some(path) => Some(UnlinkUnixSeqpacketListener(
1732             UnixSeqpacketListener::bind(path).context("failed to create control server")?,
1733         )),
1734         None => None,
1735     };
1736 
1737     let mut control_tubes = Vec::new();
1738     let mut irq_control_tubes = Vec::new();
1739     let mut vm_memory_control_tubes = Vec::new();
1740 
1741     #[cfg(feature = "gdb")]
1742     if let Some(port) = cfg.gdb {
1743         // GDB needs a control socket to interrupt vcpus.
1744         let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
1745         control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
1746         components.gdb = Some((port, gdb_control_tube));
1747     }
1748 
1749     #[cfg(feature = "balloon")]
1750     let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
1751         if let Some(ref path) = cfg.balloon_control {
1752             (
1753                 None,
1754                 Some(Tube::new_from_unix_seqpacket(
1755                     UnixSeqpacket::connect(path).with_context(|| {
1756                         format!(
1757                             "failed to connect to balloon control socket {}",
1758                             path.display(),
1759                         )
1760                     })?,
1761                 )?),
1762             )
1763         } else {
1764             // Balloon gets a special socket so balloon requests can be forwarded
1765             // from the main process.
1766             let (host, device) = Tube::pair().context("failed to create tube")?;
1767             (Some(host), Some(device))
1768         }
1769     } else {
1770         (None, None)
1771     };
1772 
1773     // The balloon device also needs a tube to communicate back to the main process to
1774     // handle remapping memory dynamically.
1775     #[cfg(feature = "balloon")]
1776     let dynamic_mapping_device_tube = if cfg.balloon {
1777         let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
1778             Tube::pair().context("failed to create tube")?;
1779         vm_memory_control_tubes.push(VmMemoryTube {
1780             tube: dynamic_mapping_host_tube,
1781             expose_with_viommu: false,
1782         });
1783         Some(dynamic_mapping_device_tube)
1784     } else {
1785         None
1786     };
1787 
1788     // Create one control socket per disk.
1789     let mut disk_device_tubes = Vec::new();
1790     let mut disk_host_tubes = Vec::new();
1791     let disk_count = cfg.disks.len();
1792     for _ in 0..disk_count {
1793         let (disk_host_tub, disk_device_tube) = Tube::pair().context("failed to create tube")?;
1794         disk_host_tubes.push(disk_host_tub);
1795         disk_device_tubes.push(disk_device_tube);
1796     }
1797 
1798     let mut pmem_device_tubes = Vec::new();
1799     let pmem_count = cfg.pmem_devices.len();
1800     for _ in 0..pmem_count {
1801         let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
1802         pmem_device_tubes.push(pmem_device_tube);
1803         control_tubes.push(TaggedControlTube::VmMsync(pmem_host_tube));
1804     }
1805 
1806     if let Some(ioapic_host_tube) = ioapic_host_tube {
1807         irq_control_tubes.push(ioapic_host_tube);
1808     }
1809 
1810     let battery = if cfg.battery_config.is_some() {
1811         #[cfg_attr(
1812             not(feature = "power-monitor-powerd"),
1813             allow(clippy::manual_map, clippy::needless_match, unused_mut)
1814         )]
1815         let jail = if let Some(jail_config) = &cfg.jail_config {
1816             let mut config = SandboxConfig::new(jail_config, "battery");
1817             #[cfg(feature = "power-monitor-powerd")]
1818             {
1819                 config.bind_mounts = true;
1820             }
1821             let mut jail =
1822                 create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
1823 
1824             // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
1825             #[cfg(feature = "power-monitor-powerd")]
1826             {
1827                 let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
1828                 jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
1829             }
1830             Some(jail)
1831         } else {
1832             None
1833         };
1834         (cfg.battery_config.as_ref().map(|c| c.type_), jail)
1835     } else {
1836         (cfg.battery_config.as_ref().map(|c| c.type_), None)
1837     };
1838 
1839     let fs_count = cfg
1840         .shared_dirs
1841         .iter()
1842         .filter(|sd| sd.kind == SharedDirKind::FS)
1843         .count();
1844     let mut fs_device_tubes = Vec::with_capacity(fs_count);
1845     for _ in 0..fs_count {
1846         let (fs_host_tube, fs_device_tube) = Tube::pair().context("failed to create tube")?;
1847         control_tubes.push(TaggedControlTube::Fs(fs_host_tube));
1848         fs_device_tubes.push(fs_device_tube);
1849     }
1850 
1851     let (vm_evt_wrtube, vm_evt_rdtube) =
1852         Tube::directional_pair().context("failed to create vm event tube")?;
1853 
1854     let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
1855     let mut sys_allocator = SystemAllocator::new(
1856         Arch::get_system_allocator_config(&vm),
1857         pstore_size,
1858         &cfg.mmio_address_ranges,
1859     )
1860     .context("failed to create system allocator")?;
1861 
1862     let ramoops_region = match &components.pstore {
1863         Some(pstore) => Some(
1864             arch::pstore::create_memory_region(
1865                 &mut vm,
1866                 sys_allocator.reserved_region().unwrap(),
1867                 pstore,
1868             )
1869             .context("failed to allocate pstore region")?,
1870         ),
1871         None => None,
1872     };
1873 
1874     create_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
1875 
1876     #[cfg(feature = "gpu")]
1877     // Hold on to the render server jail so it keeps running until we exit run_vm()
1878     let (_render_server_jail, render_server_fd) =
1879         if let Some(parameters) = &cfg.gpu_render_server_parameters {
1880             let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
1881             (Some(ScopedMinijail(jail)), Some(fd))
1882         } else {
1883             (None, None)
1884         };
1885 
1886     #[cfg(feature = "balloon")]
1887     let init_balloon_size = components
1888         .memory_size
1889         .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
1890             m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
1891         }))
1892         .context("failed to calculate init balloon size")?;
1893 
1894     let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
1895         BTreeMap::new();
1896     let mut iova_max_addr: Option<u64> = None;
1897 
1898     // pvclock gets a tube for handling suspend/resume requests from the main thread.
1899     #[cfg(feature = "pvclock")]
1900     let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
1901         let (host, device) = Tube::pair().context("failed to create tube")?;
1902         (Some(host), Some(device))
1903     } else {
1904         (None, None)
1905     };
1906     #[cfg(not(feature = "pvclock"))]
1907     if cfg.pvclock {
1908         bail!("pvclock device is only supported when crosvm is built with a feature 'pvclock'");
1909     }
1910 
1911     #[cfg(feature = "registered_events")]
1912     let (reg_evt_wrtube, reg_evt_rdtube) =
1913         Tube::directional_pair().context("failed to create registered event tube")?;
1914 
1915     let mut devices = create_devices(
1916         &cfg,
1917         &mut vm,
1918         &mut sys_allocator,
1919         &vm_evt_wrtube,
1920         &mut iommu_attached_endpoints,
1921         &mut irq_control_tubes,
1922         &mut vm_memory_control_tubes,
1923         &mut control_tubes,
1924         #[cfg(feature = "balloon")]
1925         balloon_device_tube,
1926         #[cfg(feature = "balloon")]
1927         init_balloon_size,
1928         #[cfg(feature = "balloon")]
1929         dynamic_mapping_device_tube,
1930         &mut disk_device_tubes,
1931         &mut pmem_device_tubes,
1932         &mut fs_device_tubes,
1933         #[cfg(feature = "usb")]
1934         usb_provider,
1935         #[cfg(feature = "gpu")]
1936         gpu_control_device_tube,
1937         #[cfg(feature = "gpu")]
1938         render_server_fd,
1939         &mut iova_max_addr,
1940         #[cfg(feature = "registered_events")]
1941         &reg_evt_wrtube,
1942         #[cfg(feature = "pvclock")]
1943         pvclock_device_tube,
1944     )?;
1945 
1946     #[cfg(feature = "pci-hotplug")]
1947     // TODO(293801301): Remove unused_variables after aarch64 support
1948     #[allow(unused_variables)]
1949     let pci_hotplug_slots = cfg.pci_hotplug_slots;
1950     #[cfg(not(feature = "pci-hotplug"))]
1951     #[allow(unused_variables)]
1952     let pci_hotplug_slots: Option<u8> = None;
1953     #[cfg(target_arch = "x86_64")]
1954     let hp_stub = create_pure_virtual_pcie_root_port(
1955         &mut sys_allocator,
1956         &mut irq_control_tubes,
1957         &mut devices,
1958         pci_hotplug_slots.unwrap_or(1),
1959     )?;
1960 
1961     arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
1962 
1963     let pci_devices: Vec<&dyn PciDevice> = devices
1964         .iter()
1965         .filter_map(|d| (d.0).as_pci_device())
1966         .collect();
1967 
1968     let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
1969         .into_iter()
1970         .flat_map(|s| {
1971             if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
1972                 std::iter::zip(
1973                     Some(virtio_pci_device.virtio_device()),
1974                     virtio_pci_device.pci_address(),
1975                 )
1976                 .next()
1977             } else {
1978                 None
1979             }
1980         })
1981         .collect();
1982 
1983     let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
1984         .iter()
1985         .flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
1986         .collect();
1987 
1988     // order the OpenFirmware device paths, in ascending order, by their boot_index
1989     open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));
1990 
1991     // "/pci@iocf8/" is x86 specific and represents the root at the system bus port
1992     let mut bootorder_fw_cfg_blob =
1993         open_firmware_device_paths
1994             .into_iter()
1995             .fold(Vec::new(), |a, b| {
1996                 a.into_iter()
1997                     .chain("/pci@i0cf8/".as_bytes().iter().copied())
1998                     .chain(b.0)
1999                     .chain("\n".as_bytes().iter().copied())
2000                     .collect()
2001             });
2002 
2003     // the "bootorder" file is expected to end with a null terminator
2004     bootorder_fw_cfg_blob.push(0);
2005 
2006     components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;
2007 
2008     // if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
2009     // "bootorder" file can be accessed by the guest.
2010     components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;
2011 
2012     let (translate_response_senders, request_rx) = setup_virtio_access_platform(
2013         &mut sys_allocator,
2014         &mut iommu_attached_endpoints,
2015         &mut devices,
2016     )?;
2017 
2018     #[cfg(target_arch = "x86_64")]
2019     let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
2020     #[cfg(not(target_arch = "x86_64"))]
2021     let iommu_bus_ranges = Vec::new();
2022 
2023     let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
2024         || (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
2025     {
2026         let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
2027         let iommu_dev = create_iommu_device(
2028             cfg.protection_type,
2029             &cfg.jail_config,
2030             iova_max_addr.unwrap_or(u64::MAX),
2031             iommu_attached_endpoints,
2032             iommu_bus_ranges,
2033             translate_response_senders,
2034             request_rx,
2035             iommu_device_tube,
2036         )?;
2037 
2038         let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
2039         irq_control_tubes.push(msi_host_tube);
2040         let (ioevent_host_tube, ioevent_device_tube) =
2041             Tube::pair().context("failed to create ioevent tube")?;
2042         vm_memory_control_tubes.push(VmMemoryTube {
2043             tube: ioevent_host_tube,
2044             expose_with_viommu: false,
2045         });
2046         let (host_tube, device_tube) =
2047             Tube::pair().context("failed to create device control tube")?;
2048         control_tubes.push(TaggedControlTube::Vm(host_tube));
2049         let mut dev = VirtioPciDevice::new(
2050             vm.get_memory().clone(),
2051             iommu_dev.dev,
2052             msi_device_tube,
2053             cfg.disable_virtio_intx,
2054             None,
2055             VmMemoryClient::new(ioevent_device_tube),
2056             device_tube,
2057         )
2058         .context("failed to create virtio pci dev")?;
2059         // early reservation for viommu.
2060         dev.allocate_address(&mut sys_allocator)
2061             .context("failed to allocate resources early for virtio pci dev")?;
2062         let dev = Box::new(dev);
2063         devices.push((dev, iommu_dev.jail));
2064         Some(iommu_host_tube)
2065     } else {
2066         None
2067     };
2068 
2069     #[cfg(target_arch = "x86_64")]
2070     for device in devices
2071         .iter_mut()
2072         .filter_map(|(dev, _)| dev.as_pci_device_mut())
2073     {
2074         let sdts = device
2075             .generate_acpi(components.acpi_sdts)
2076             .or_else(|| {
2077                 error!("ACPI table generation error");
2078                 None
2079             })
2080             .ok_or_else(|| anyhow!("failed to generate ACPI table"))?;
2081         components.acpi_sdts = sdts;
2082     }
2083 
2084     // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
2085     let mut vcpu_ids = Vec::new();
2086 
2087     let guest_suspended_cvar = if cfg.force_s2idle {
2088         Some(Arc::new((Mutex::new(false), Condvar::new())))
2089     } else {
2090         None
2091     };
2092 
2093     let dt_overlays = cfg
2094         .device_tree_overlay
2095         .iter()
2096         .map(|o| {
2097             Ok(DtbOverlay {
2098                 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2099                     .with_context(|| {
2100                         format!("failed to open device tree overlay {}", o.path.display())
2101                     })?,
2102                 do_filter: o.filter_devs,
2103             })
2104         })
2105         .collect::<Result<Vec<DtbOverlay>>>()?;
2106 
2107     let mut linux = Arch::build_vm::<V, Vcpu>(
2108         components,
2109         &vm_evt_wrtube,
2110         &mut sys_allocator,
2111         &cfg.serial_parameters,
2112         simple_jail(&cfg.jail_config, "serial_device")?,
2113         battery,
2114         vm,
2115         ramoops_region,
2116         devices,
2117         irq_chip,
2118         &mut vcpu_ids,
2119         cfg.dump_device_tree_blob.clone(),
2120         simple_jail(&cfg.jail_config, "serial_device")?,
2121         #[cfg(target_arch = "x86_64")]
2122         simple_jail(&cfg.jail_config, "block_device")?,
2123         #[cfg(target_arch = "x86_64")]
2124         simple_jail(&cfg.jail_config, "fw_cfg_device")?,
2125         #[cfg(feature = "swap")]
2126         &mut swap_controller,
2127         guest_suspended_cvar.clone(),
2128         dt_overlays,
2129     )
2130     .context("the architecture failed to build the vm")?;
2131 
2132     if let Some(tube) = linux.vm_request_tube.take() {
2133         control_tubes.push(TaggedControlTube::Vm(tube));
2134     }
2135 
2136     #[cfg(target_arch = "x86_64")]
2137     let (hp_control_tube, hp_worker_tube) = mpsc::channel();
2138     #[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
2139     if let Some(hotplug_manager) = &mut hotplug_manager {
2140         hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
2141     }
2142     #[cfg(target_arch = "x86_64")]
2143     let hp_thread = {
2144         for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
2145             #[cfg(feature = "pci-hotplug")]
2146             if let Some(hotplug_manager) = &mut hotplug_manager {
2147                 hotplug_manager.add_port(hp_bus)?;
2148             } else {
2149                 linux.hotplug_bus.insert(bus_num, hp_bus);
2150             }
2151             #[cfg(not(feature = "pci-hotplug"))]
2152             linux.hotplug_bus.insert(bus_num, hp_bus);
2153         }
2154 
2155         if let Some(pm) = &linux.pm {
2156             for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
2157                 pm.lock().register_gpe_notify_dev(gpe, notify_dev);
2158             }
2159             for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
2160                 pm.lock().register_pme_notify_dev(bus, notify_dev);
2161             }
2162         }
2163 
2164         let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
2165             Tube::pair().context("failed to create tube")?;
2166         vm_memory_control_tubes.push(VmMemoryTube {
2167             tube: hp_vm_mem_host_tube,
2168             expose_with_viommu: false,
2169         });
2170 
2171         let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
2172         let pci_root = linux.root_config.clone();
2173         std::thread::Builder::new()
2174             .name("pci_root".to_string())
2175             .spawn(move || {
2176                 start_pci_root_worker(
2177                     supports_readonly_mapping,
2178                     pci_root,
2179                     hp_worker_tube,
2180                     hp_vm_mem_worker_tube,
2181                 )
2182             })?
2183     };
2184 
2185     let flags = RutabagaGrallocBackendFlags::new().disable_vulkano();
2186     let gralloc = RutabagaGralloc::new(flags).context("failed to create gralloc")?;
2187 
2188     run_control(
2189         linux,
2190         sys_allocator,
2191         cfg,
2192         control_server_socket,
2193         irq_control_tubes,
2194         vm_memory_control_tubes,
2195         control_tubes,
2196         #[cfg(feature = "balloon")]
2197         balloon_host_tube,
2198         &disk_host_tubes,
2199         #[cfg(feature = "gpu")]
2200         gpu_control_host_tube,
2201         #[cfg(feature = "usb")]
2202         usb_control_tube,
2203         vm_evt_rdtube,
2204         vm_evt_wrtube,
2205         sigchld_fd,
2206         gralloc,
2207         vcpu_ids,
2208         iommu_host_tube,
2209         #[cfg(target_arch = "x86_64")]
2210         hp_control_tube,
2211         #[cfg(target_arch = "x86_64")]
2212         hp_thread,
2213         #[cfg(feature = "pci-hotplug")]
2214         hotplug_manager,
2215         #[cfg(feature = "swap")]
2216         swap_controller,
2217         #[cfg(feature = "registered_events")]
2218         reg_evt_rdtube,
2219         guest_suspended_cvar,
2220         #[cfg(feature = "pvclock")]
2221         pvclock_host_tube,
2222         metrics_recv,
2223     )
2224 }
2225 
2226 // Hotplug command is facing dead lock issue when it tries to acquire the lock
2227 // for pci root in the vm control thread. Dead lock could happen when the vm
2228 // control thread(Thread A namely) is handling the hotplug command and it tries
2229 // to get the lock for pci root. However, the lock is already hold by another
2230 // device in thread B, which is actively sending an vm control to be handled by
2231 // thread A and waiting for response. However, thread A is blocked on acquiring
2232 // the lock, so dead lock happens. In order to resolve this issue, we add this
2233 // worker thread and push all work that locks pci root to this thread.
2234 #[cfg(target_arch = "x86_64")]
start_pci_root_worker( supports_readonly_mapping: bool, pci_root: Arc<Mutex<PciRoot>>, hp_device_tube: mpsc::Receiver<PciRootCommand>, vm_control_tube: Tube, )2235 fn start_pci_root_worker(
2236     supports_readonly_mapping: bool,
2237     pci_root: Arc<Mutex<PciRoot>>,
2238     hp_device_tube: mpsc::Receiver<PciRootCommand>,
2239     vm_control_tube: Tube,
2240 ) {
2241     struct PciMmioMapperTube {
2242         supports_readonly_mapping: bool,
2243         vm_control_tube: Tube,
2244         registered_regions: BTreeMap<u32, VmMemoryRegionId>,
2245         next_id: u32,
2246     }
2247 
2248     impl PciMmioMapper for PciMmioMapperTube {
2249         fn supports_readonly_mapping(&self) -> bool {
2250             self.supports_readonly_mapping
2251         }
2252 
2253         fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
2254             let shmem = shmem
2255                 .try_clone()
2256                 .context("failed to create new SharedMemory")?;
2257             self.vm_control_tube
2258                 .send(&VmMemoryRequest::RegisterMemory {
2259                     source: VmMemorySource::SharedMemory(shmem),
2260                     dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
2261                     prot: Protection::read(),
2262                     cache: MemCacheType::CacheCoherent,
2263                 })
2264                 .context("failed to send request")?;
2265             match self.vm_control_tube.recv::<VmMemoryResponse>() {
2266                 Ok(VmMemoryResponse::RegisterMemory(slot)) => {
2267                     let cur_id = self.next_id;
2268                     self.registered_regions.insert(cur_id, slot);
2269                     self.next_id += 1;
2270                     Ok(cur_id)
2271                 }
2272                 res => bail!("Bad response: {:?}", res),
2273             }
2274         }
2275     }
2276 
2277     let mut mapper = PciMmioMapperTube {
2278         supports_readonly_mapping,
2279         vm_control_tube,
2280         registered_regions: BTreeMap::new(),
2281         next_id: 0,
2282     };
2283 
2284     loop {
2285         match hp_device_tube.recv() {
2286             Ok(cmd) => match cmd {
2287                 PciRootCommand::Add(addr, device) => {
2288                     if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
2289                         error!("failed to add hotplugged device to PCI root port: {}", e);
2290                     }
2291                 }
2292                 PciRootCommand::AddBridge(pci_bus) => {
2293                     if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
2294                         error!("failed to add hotplugged bridge to PCI root port: {}", e);
2295                     }
2296                 }
2297                 PciRootCommand::Remove(addr) => {
2298                     pci_root.lock().remove_device(addr);
2299                 }
2300                 PciRootCommand::Kill => break,
2301             },
2302             Err(e) => {
2303                 error!("Error: pci root worker channel closed: {}", e);
2304                 break;
2305             }
2306         }
2307     }
2308 }
2309 
2310 #[cfg(target_arch = "x86_64")]
get_hp_bus<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, host_addr: PciAddress, ) -> Result<Arc<Mutex<dyn HotPlugBus>>>2311 fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2312     linux: &RunnableLinuxVm<V, Vcpu>,
2313     host_addr: PciAddress,
2314 ) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2315     for (_, hp_bus) in linux.hotplug_bus.iter() {
2316         if hp_bus.lock().is_match(host_addr).is_some() {
2317             return Ok(hp_bus.clone());
2318         }
2319     }
2320     Err(anyhow!("Failed to find a suitable hotplug bus"))
2321 }
2322 
/// Hot-plugs a host PCI device (a PCIe upstream/downstream port or a VFIO
/// endpoint) into the running guest.
///
/// The device's host PCI address is parsed from `device.path` and used to
/// locate the matching hotplug bus. For bridge ports, an emulated `PciBridge`
/// is built around a `PcieHostPort` and the new secondary bus is registered in
/// `linux.hotplug_bus`. For endpoints, a VFIO device is created and, when
/// `iommu_host_tube` is `Some`, attached to the virtio-iommu.
///
/// Newly created host-side tubes are appended to `irq_control_tubes`,
/// `vm_memory_control_tubes`, and `control_tubes` so the caller can register
/// them with the corresponding handler threads.
///
/// Returns `Ok(())` once the device has been added to the hotplug bus (and a
/// hotplug interrupt delivered, if `device.hp_interrupt` is set).
#[cfg(target_arch = "x86_64")]
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    cfg: &Config,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
    control_tubes: &mut Vec<TaggedControlTube>,
    hp_control_tube: &mpsc::Sender<PciRootCommand>,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
    #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)
        .context("failed to parse hotplug device's PCI address")?;
    let hp_bus = get_hp_bus(linux, host_addr)?;

    let (hotplug_key, pci_address) = match device.device_type {
        HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
            // Both port types need a VM control tube and an MSI tube wired to
            // the host-side handler threads.
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            control_tubes.push(TaggedControlTube::Vm(vm_host_tube));
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            irq_control_tubes.push(msi_host_tube);
            let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
            let (hotplug_key, pci_bridge) = match device.device_type {
                HotPlugDeviceType::UpstreamPort => {
                    let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
                    let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
                        pcie_host, true,
                    )?));
                    let pci_bridge =
                        Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
                    // The bridge's secondary bus becomes a new hotplug bus.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
                    (hotplug_key, pci_bridge)
                }
                HotPlugDeviceType::DownstreamPort => {
                    let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
                    let pcie_downstream_port = Arc::new(Mutex::new(
                        PcieDownstreamPort::new_from_host(pcie_host, true)?,
                    ));
                    let pci_bridge = Box::new(PciBridge::new(
                        pcie_downstream_port.clone(),
                        msi_device_tube,
                    ));
                    // The bridge's secondary bus becomes a new hotplug bus.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
                    (hotplug_key, pci_bridge)
                }
                // The outer match already restricted the type to the two port
                // variants above.
                _ => {
                    bail!("Impossible to reach here")
                }
            };
            let pci_address = Arch::register_pci_device(
                linux,
                pci_bridge,
                None,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;

            (hotplug_key, pci_address)
        }
        HotPlugDeviceType::EndPoint => {
            let hotplug_key = HotPlugKey::HostVfio { host_addr };
            let (vfio_device, jail, viommu_mapper) = create_vfio_device(
                &cfg.jail_config,
                &linux.vm,
                sys_allocator,
                irq_control_tubes,
                vm_memory_control_tubes,
                control_tubes,
                &device.path,
                true,
                None,
                None,
                None,
                // Only put the endpoint behind the virtio-iommu when the
                // caller provided an iommu control tube.
                if iommu_host_tube.is_some() {
                    IommuDevType::VirtioIommu
                } else {
                    IommuDevType::NoIommu
                },
                None,
            )?;
            let vfio_pci_device = match vfio_device {
                VfioDeviceVariant::Pci(pci) => Box::new(pci),
                VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
            };
            let pci_address = Arch::register_pci_device(
                linux,
                vfio_pci_device,
                jail,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
            // Announce the new endpoint to the virtio-iommu device so its
            // container is mapped for DMA.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let endpoint_addr = pci_address.to_u32();
                let vfio_wrapper = viommu_mapper.context("expected mapper")?;
                let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
                        endpoint_addr,
                        wrapper_id: vfio_wrapper.id(),
                        container: {
                            // SAFETY:
                            // Safe because the descriptor is uniquely owned by `descriptor`.
                            unsafe { File::from_raw_descriptor(descriptor) }
                        },
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }

            (hotplug_key, pci_address)
        }
    };
    hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
    // Optionally notify the guest with a hotplug interrupt.
    if device.hp_interrupt {
        hp_bus.lock().hot_plug(pci_address)?;
    }
    Ok(())
}
2455 
/// Hot-plugs a virtio-net device described by `net_param` into the guest.
///
/// Creates the three control channels the device needs (MSI routing, ioevent
/// registration, and VM control), pushes the host ends onto the caller's tube
/// lists so they get registered with the handler threads, and hands the
/// device ends to a `NetResourceCarrier` for the hotplug manager.
///
/// Returns the guest PCI bus number the device was plugged into.
#[cfg(feature = "pci-hotplug")]
fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
    vm_control_tubes: &mut Vec<TaggedControlTube>,
    hotplug_manager: &mut PciHotPlugManager,
    net_param: NetParameters,
) -> Result<u8> {
    // MSI channel: host side is serviced by the IRQ handler thread.
    let (msi_host, msi_device) = Tube::pair().context("create tube")?;
    irq_control_tubes.push(msi_host);

    // ioevent channel: host side is serviced by the VM memory handler thread.
    let (ioevent_host, ioevent_device) = Tube::pair().context("create tube")?;
    let ioevent_vm_memory_client = VmMemoryClient::new(ioevent_device);
    vm_memory_control_tubes.push(VmMemoryTube {
        tube: ioevent_host,
        expose_with_viommu: false,
    });

    // General VM control channel for the new device.
    let (vm_control_host, vm_control_device) = Tube::pair().context("create tube")?;
    vm_control_tubes.push(TaggedControlTube::Vm(vm_control_host));

    let carrier = NetResourceCarrier::new(
        net_param,
        msi_device,
        ioevent_vm_memory_client,
        vm_control_device,
    );
    hotplug_manager.hotplug_device(
        vec![ResourceCarrier::VirtioNet(carrier)],
        linux,
        sys_allocator,
    )
}
2488 
/// Dispatches a network hotplug control command to the matching helper.
///
/// `AddTap` plugs a new tap-backed virtio-net device; `RemoveTap` unplugs the
/// device on the given guest bus. The result of either operation is converted
/// into a `VmResponse` for the control-socket client.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
    net_cmd: NetControlCommand,
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
    vm_control_tubes: &mut Vec<TaggedControlTube>,
    hotplug_manager: &mut PciHotPlugManager,
) -> VmResponse {
    match net_cmd {
        NetControlCommand::RemoveTap(bus) => {
            handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus)
        }
        NetControlCommand::AddTap(tap_name) => handle_hotplug_net_add(
            linux,
            sys_allocator,
            irq_control_tubes,
            vm_memory_control_tubes,
            vm_control_tubes,
            hotplug_manager,
            &tap_name,
        ),
    }
}
2514 
/// Hot-plugs a virtio-net device backed by the existing host tap `tap_name`.
///
/// Builds a `NetParameters` referring to the tap by name (MAC and PCI address
/// left for the manager to assign) and delegates to `add_hotplug_net`. On
/// success the response carries the guest bus number; on failure the error is
/// formatted into an error-string response.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
    vm_control_tubes: &mut Vec<TaggedControlTube>,
    hotplug_manager: &mut PciHotPlugManager,
    tap_name: &str,
) -> VmResponse {
    let net_param = NetParameters {
        mode: NetParametersMode::TapName {
            tap_name: tap_name.to_owned(),
            mac: None,
        },
        vhost_net: None,
        vq_pairs: None,
        packed_queue: false,
        pci_address: None,
    };

    match add_hotplug_net(
        linux,
        sys_allocator,
        irq_control_tubes,
        vm_memory_control_tubes,
        vm_control_tubes,
        hotplug_manager,
        net_param,
    ) {
        Ok(pci_bus) => VmResponse::PciHotPlugResponse { bus: pci_bus },
        Err(e) => VmResponse::ErrString(format!("{:?}", e)),
    }
}
2551 
/// Hot-unplugs the network device on guest PCI bus `bus`.
///
/// Delegates to the hotplug manager; any failure is formatted into an
/// error-string response for the control-socket client.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    hotplug_manager: &mut PciHotPlugManager,
    bus: u8,
) -> VmResponse {
    if let Err(e) = hotplug_manager.remove_hotplug_device(bus, linux, sys_allocator) {
        return VmResponse::ErrString(format!("{:?}", e));
    }
    VmResponse::Ok
}
2564 
2565 #[cfg(target_arch = "x86_64")]
remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, buses_to_remove: &mut Vec<u8>, hotplug_key: HotPlugKey, child_bus: u8, ) -> Result<()>2566 fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
2567     linux: &RunnableLinuxVm<V, Vcpu>,
2568     sys_allocator: &mut SystemAllocator,
2569     buses_to_remove: &mut Vec<u8>,
2570     hotplug_key: HotPlugKey,
2571     child_bus: u8,
2572 ) -> Result<()> {
2573     for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
2574         let mut hp_bus_lock = hp_bus.lock();
2575         if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2576             sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
2577             hp_bus_lock.hot_unplug(pci_addr)?;
2578             buses_to_remove.push(child_bus);
2579             if hp_bus_lock.is_empty() {
2580                 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2581                     remove_hotplug_bridge(
2582                         linux,
2583                         sys_allocator,
2584                         buses_to_remove,
2585                         hotplug_key,
2586                         *bus_num,
2587                     )?;
2588                 }
2589             }
2590             return Ok(());
2591         }
2592     }
2593 
2594     Err(anyhow!(
2595         "Can not find device {:?} on hotplug buses",
2596         hotplug_key
2597     ))
2598 }
2599 
/// Hot-unplugs the host PCI device described by `device` from the guest.
///
/// The device's hotplug key is reconstructed from its host address and type,
/// then the hosting hotplug bus is located. For VFIO endpoints attached to
/// the virtio-iommu, the endpoint is first detached via `iommu_host_tube`.
/// Bridges that become empty are removed recursively, including sibling
/// downstream ports with no children (common for Thunderbolt topologies).
///
/// # Errors
///
/// Returns an error if the device is not found on any hotplug bus, or if
/// detaching from the iommu / unplugging fails.
#[cfg(target_arch = "x86_64")]
fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)?;
    // Rebuild the key the device was registered under in add_hotplug_device().
    let hotplug_key = match device.device_type {
        HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
        HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
        HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
    };

    // Locate the (bus number, bus) pair currently hosting this device.
    let hp_bus = linux
        .hotplug_bus
        .iter()
        .find(|(_, hp_bus)| {
            let hp_bus = hp_bus.lock();
            hp_bus.get_hotplug_device(hotplug_key).is_some()
        })
        .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));

    if let Some((bus_num, hp_bus)) = hp_bus {
        let mut buses_to_remove = Vec::new();
        let mut removed_key = None;
        let mut hp_bus_lock = hp_bus.lock();
        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
            // Detach the endpoint from the virtio-iommu before unplugging.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
                        endpoint_addr: pci_addr.to_u32(),
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }
            // Check whether every sibling downstream port (same host bus,
            // different guest bus) is also empty.
            // NOTE: "simbling" in the identifiers below is a historical typo
            // for "sibling"; kept as-is to avoid code changes here.
            let mut empty_simbling = true;
            if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
                hp_bus_lock.get_hotplug_key()
            {
                let addr_alias = host_addr;
                for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                    if *simbling_bus_num != bus_num {
                        let hp_bus_lock = hp_bus.lock();
                        let hotplug_key = hp_bus_lock.get_hotplug_key();
                        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                            if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
                                empty_simbling = false;
                                break;
                            }
                        }
                    }
                }
            }

            // If all sibling downstream ports are empty, do not send hot unplug event for this
            // downstream port. Root port will send one plug out interrupt and remove all
            // the remaining devices
            if !empty_simbling {
                hp_bus_lock.hot_unplug(pci_addr)?;
            }

            sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
            // If the hosting bridge is now (or was already effectively) empty,
            // recursively unplug the bridge itself from its parent.
            if empty_simbling || hp_bus_lock.is_empty() {
                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
                    removed_key = Some(hotplug_key);
                    remove_hotplug_bridge(
                        linux,
                        sys_allocator,
                        &mut buses_to_remove,
                        hotplug_key,
                        bus_num,
                    )?;
                }
            }
        }

        // Some types of TBT device has a few empty downstream ports. The emulated bridges
        // of these ports won't be removed since no vfio device is connected to our emulated
        // bridges. So we explicitly check all sibling bridges of the removed bridge here,
        // and remove them if bridge has no child device connected.
        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
            let addr_alias = host_addr;
            for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                if *simbling_bus_num != bus_num {
                    let hp_bus_lock = hp_bus.lock();
                    let hotplug_key = hp_bus_lock.get_hotplug_key();
                    if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                        if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
                            remove_hotplug_bridge(
                                linux,
                                sys_allocator,
                                &mut buses_to_remove,
                                hotplug_key.unwrap(),
                                *simbling_bus_num,
                            )?;
                        }
                    }
                }
            }
        }
        // Drop the torn-down secondary buses from the hotplug bus map.
        for bus in buses_to_remove.iter() {
            linux.hotplug_bus.remove(bus);
        }
        return Ok(());
    }

    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        hotplug_key
    ))
}
2716 
trigger_vm_suspend_and_wait_for_entry( guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>, tube: &SendTube, response: vm_control::VmResponse, suspend_evt: Event, pm: Option<Arc<Mutex<dyn PmResource + Send>>>, )2717 pub fn trigger_vm_suspend_and_wait_for_entry(
2718     guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
2719     tube: &SendTube,
2720     response: vm_control::VmResponse,
2721     suspend_evt: Event,
2722     pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
2723 ) {
2724     let (lock, cvar) = &*guest_suspended_cvar;
2725     let mut guest_suspended = lock.lock();
2726 
2727     *guest_suspended = false;
2728 
2729     // During suspend also emulate sleepbtn, which allows to suspend VM (if running e.g. acpid and
2730     // reacts on sleep button events)
2731     if let Some(pm) = pm {
2732         pm.lock().slpbtn_evt();
2733     } else {
2734         error!("generating sleepbtn during suspend not supported");
2735     }
2736 
2737     // Wait for notification about guest suspension, if not received after 15sec,
2738     // proceed anyway.
2739     let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
2740     guest_suspended = result.0;
2741 
2742     if result.1.timed_out() {
2743         warn!("Guest suspension timeout - proceeding anyway");
2744     } else if *guest_suspended {
2745         info!("Guest suspended");
2746     }
2747 
2748     if let Err(e) = suspend_evt.signal() {
2749         error!("failed to trigger suspend event: {}", e);
2750     }
2751     // Now we ready to send response over the tube and communicate that VM suspend has finished
2752     if let Err(e) = tube.send(&response) {
2753         error!("failed to send VmResponse: {}", e);
2754     }
2755 }
2756 
/// Sends `command` to the pvclock device over `tube` and checks the reply.
///
/// An explicit `Err` reply is turned into an error; a `DeviceInactive` reply
/// is logged as a warning (the command had no effect); any other reply is
/// logged at info level and treated as success.
#[cfg(feature = "pvclock")]
fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<()> {
    tube.send(&command)
        .with_context(|| format!("failed to send pvclock command {:?}", command))?;
    let resp = tube
        .recv::<PvClockCommandResponse>()
        .context("failed to receive pvclock command response")?;
    match &resp {
        PvClockCommandResponse::Err(e) => {
            bail!("pvclock encountered error on {:?}: {}", command, e)
        }
        PvClockCommandResponse::DeviceInactive => {
            warn!("Tried to send {command:?} but pvclock device was inactive")
        }
        _ => info!("{command:?} completed with {resp:?}"),
    }
    Ok(())
}
2774 
2775 #[cfg(target_arch = "x86_64")]
handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, cfg: &Config, add_irq_control_tubes: &mut Vec<Tube>, add_vm_memory_control_tubes: &mut Vec<VmMemoryTube>, add_tubes: &mut Vec<TaggedControlTube>, hp_control_tube: &mpsc::Sender<PciRootCommand>, iommu_host_tube: Option<&Tube>, device: &HotPlugDeviceInfo, add: bool, #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>, ) -> VmResponse2776 fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
2777     linux: &mut RunnableLinuxVm<V, Vcpu>,
2778     sys_allocator: &mut SystemAllocator,
2779     cfg: &Config,
2780     add_irq_control_tubes: &mut Vec<Tube>,
2781     add_vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
2782     add_tubes: &mut Vec<TaggedControlTube>,
2783     hp_control_tube: &mpsc::Sender<PciRootCommand>,
2784     iommu_host_tube: Option<&Tube>,
2785     device: &HotPlugDeviceInfo,
2786     add: bool,
2787     #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
2788 ) -> VmResponse {
2789     let iommu_host_tube = if cfg.vfio_isolate_hotplug {
2790         iommu_host_tube
2791     } else {
2792         None
2793     };
2794 
2795     let ret = if add {
2796         add_hotplug_device(
2797             linux,
2798             sys_allocator,
2799             cfg,
2800             add_irq_control_tubes,
2801             add_vm_memory_control_tubes,
2802             add_tubes,
2803             hp_control_tube,
2804             iommu_host_tube,
2805             device,
2806             #[cfg(feature = "swap")]
2807             swap_controller,
2808         )
2809     } else {
2810         remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
2811     };
2812 
2813     match ret {
2814         Ok(()) => VmResponse::Ok,
2815         Err(e) => {
2816             error!("hanlde_hotplug_command failure: {}", e);
2817             add_tubes.clear();
2818             VmResponse::Err(base::Error::new(libc::EINVAL))
2819         }
2820     }
2821 }
2822 
/// Borrowed state threaded through each iteration of the VM control loop,
/// bundling everything `process_vm_request` needs to service a request.
struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
    /// The running VM and its devices.
    linux: &'a mut RunnableLinuxVm<V, Vcpu>,
    /// The immutable VM configuration.
    cfg: &'a Config,
    /// Shared resource (PCI/IRQ/mmio) allocator.
    sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
    /// Registered control tubes, keyed by their event-token id.
    control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
    /// One tube per disk device for disk control commands.
    disk_host_tubes: &'a [Tube],
    #[cfg(feature = "gpu")]
    gpu_control_tube: &'a Tube,
    #[cfg(feature = "usb")]
    usb_control_tube: &'a Tube,
    /// Tube to the virtio-iommu, when one is present.
    #[cfg(target_arch = "x86_64")]
    iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
    /// Channel to the PCI root worker for hotplug add/remove commands.
    #[cfg(target_arch = "x86_64")]
    hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
    /// Flag+condvar pair the guest-suspend path waits on (s2idle).
    guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
    #[cfg(feature = "pci-hotplug")]
    hotplug_manager: &'a mut Option<PciHotPlugManager>,
    #[cfg(feature = "swap")]
    swap_controller: &'a mut Option<SwapController>,
    /// Join handle and control channel for each vCPU thread.
    vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
    #[cfg(feature = "balloon")]
    balloon_tube: Option<&'a mut BalloonTube>,
    /// Tube for device sleep/wake/snapshot control.
    device_ctrl_tube: &'a Tube,
    /// Tube to the IRQ handler thread.
    irq_handler_control: &'a Tube,
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    vm_memory_handler_control: &'a Tube,
    /// Listeners registered for proto-based VM events, keyed by event kind.
    #[cfg(feature = "registered_events")]
    registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    #[cfg(feature = "pvclock")]
    pvclock_host_tube: Option<Arc<Tube>>,
}
2854 
process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, tube: &Tube, request: VmRequest, #[cfg_attr( not(any(target_arch = "x86_64", feature = "pci-hotplug")), allow(unused_variables, clippy::ptr_arg) )] add_tubes: &mut Vec<TaggedControlTube>, ) -> Result<(Option<VmResponse>, bool, Option<VmRunMode>)>2855 fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
2856     state: &mut ControlLoopState<V, Vcpu>,
2857     id: usize,
2858     tube: &Tube,
2859     request: VmRequest,
2860     #[cfg_attr(
2861         not(any(target_arch = "x86_64", feature = "pci-hotplug")),
2862         allow(unused_variables, clippy::ptr_arg)
2863     )]
2864     add_tubes: &mut Vec<TaggedControlTube>,
2865 ) -> Result<(Option<VmResponse>, bool, Option<VmRunMode>)> {
2866     let mut suspend_requested = false;
2867     let mut run_mode_opt = None;
2868 
2869     #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
2870     let mut add_irq_control_tubes = Vec::new();
2871     #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
2872     let mut add_vm_memory_control_tubes = Vec::new();
2873 
2874     let response = match request {
2875         VmRequest::HotPlugVfioCommand { device, add } => {
2876             #[cfg(target_arch = "x86_64")]
2877             {
2878                 handle_hotplug_command(
2879                     state.linux,
2880                     &mut state.sys_allocator.lock(),
2881                     state.cfg,
2882                     &mut add_irq_control_tubes,
2883                     &mut add_vm_memory_control_tubes,
2884                     add_tubes,
2885                     state.hp_control_tube,
2886                     state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
2887                     &device,
2888                     add,
2889                     #[cfg(feature = "swap")]
2890                     state.swap_controller,
2891                 )
2892             }
2893 
2894             #[cfg(not(target_arch = "x86_64"))]
2895             {
2896                 // Suppress warnings.
2897                 let _ = (device, add);
2898                 VmResponse::Ok
2899             }
2900         }
2901         #[cfg(feature = "pci-hotplug")]
2902         VmRequest::HotPlugNetCommand(net_cmd) => {
2903             if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
2904                 handle_hotplug_net_command(
2905                     net_cmd,
2906                     state.linux,
2907                     &mut state.sys_allocator.lock(),
2908                     &mut add_irq_control_tubes,
2909                     &mut add_vm_memory_control_tubes,
2910                     add_tubes,
2911                     hotplug_manager,
2912                 )
2913             } else {
2914                 VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
2915             }
2916         }
2917         #[cfg(feature = "registered_events")]
2918         VmRequest::RegisterListener { socket_addr, event } => {
2919             let (registered_tube, already_registered) =
2920                 find_registered_tube(state.registered_evt_tubes, &socket_addr, event);
2921 
2922             if !already_registered {
2923                 let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;
2924 
2925                 if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
2926                     tubes.insert(addr_tube);
2927                 } else {
2928                     state
2929                         .registered_evt_tubes
2930                         .insert(event, vec![addr_tube].into_iter().collect());
2931                 }
2932             }
2933             VmResponse::Ok
2934         }
2935         #[cfg(feature = "registered_events")]
2936         VmRequest::UnregisterListener { socket_addr, event } => {
2937             if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
2938                 tubes.retain(|t| t.socket_addr != socket_addr);
2939             }
2940             state
2941                 .registered_evt_tubes
2942                 .retain(|_, tubes| !tubes.is_empty());
2943             VmResponse::Ok
2944         }
2945         #[cfg(feature = "registered_events")]
2946         VmRequest::Unregister { socket_addr } => {
2947             for (_, tubes) in state.registered_evt_tubes.iter_mut() {
2948                 tubes.retain(|t| t.socket_addr != socket_addr);
2949             }
2950             state
2951                 .registered_evt_tubes
2952                 .retain(|_, tubes| !tubes.is_empty());
2953             VmResponse::Ok
2954         }
2955         #[cfg(feature = "balloon")]
2956         VmRequest::BalloonCommand(cmd) => {
2957             if let Some(tube) = state.balloon_tube.as_mut() {
2958                 let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
2959                     return Ok((None, false, None));
2960                 };
2961                 if key != id {
2962                     let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
2963                         return Ok((None, false, None));
2964                     };
2965                     if let Err(e) = tube.send(&r) {
2966                         error!("failed to send VmResponse: {}", e);
2967                     }
2968                     return Ok((None, false, None));
2969                 }
2970                 r
2971             } else {
2972                 VmResponse::Err(base::Error::new(libc::ENOTSUP))
2973             }
2974         }
2975         _ => {
2976             let response = request.execute(
2977                 &state.linux.vm,
2978                 &mut run_mode_opt,
2979                 state.disk_host_tubes,
2980                 &mut state.linux.pm,
2981                 #[cfg(feature = "gpu")]
2982                 Some(state.gpu_control_tube),
2983                 #[cfg(not(feature = "gpu"))]
2984                 None,
2985                 #[cfg(feature = "usb")]
2986                 Some(state.usb_control_tube),
2987                 #[cfg(not(feature = "usb"))]
2988                 None,
2989                 &mut state.linux.bat_control,
2990                 |msg| {
2991                     vcpu::kick_all_vcpus(
2992                         state.vcpu_handles,
2993                         state.linux.irq_chip.as_irq_chip(),
2994                         msg,
2995                     )
2996                 },
2997                 state.cfg.force_s2idle,
2998                 #[cfg(feature = "swap")]
2999                 state.swap_controller.as_ref(),
3000                 state.device_ctrl_tube,
3001                 state.vcpu_handles.len(),
3002                 state.irq_handler_control,
3003                 || state.linux.irq_chip.snapshot(state.linux.vcpu_count),
3004             );
3005             if state.cfg.force_s2idle {
3006                 if let VmRequest::SuspendVcpus = request {
3007                     suspend_requested = true;
3008 
3009                     // Spawn s2idle wait thread.
3010                     let send_tube = tube.try_clone_send_tube().unwrap();
3011                     let suspend_evt = state.linux.suspend_evt.try_clone().unwrap();
3012                     let guest_suspended_cvar = state.guest_suspended_cvar.clone();
3013                     let delayed_response = response.clone();
3014                     let pm = state.linux.pm.clone();
3015 
3016                     std::thread::Builder::new()
3017                         .name("s2idle_wait".to_owned())
3018                         .spawn(move || {
3019                             trigger_vm_suspend_and_wait_for_entry(
3020                                 guest_suspended_cvar.unwrap(),
3021                                 &send_tube,
3022                                 delayed_response,
3023                                 suspend_evt,
3024                                 pm,
3025                             )
3026                         })
3027                         .context("failed to spawn s2idle_wait thread")?;
3028                 }
3029             } else {
3030                 // if not doing s2idle, the guest clock should
3031                 // behave as the host does, so let the guest
3032                 // know about the suspend / resume via
3033                 // virtio-pvclock.
3034                 #[cfg(feature = "pvclock")]
3035                 if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
3036                     let cmd = match request {
3037                         VmRequest::SuspendVcpus => Some(PvClockCommand::Suspend),
3038                         VmRequest::ResumeVcpus => Some(PvClockCommand::Resume),
3039                         _ => None,
3040                     };
3041                     if let Some(cmd) = cmd {
3042                         if let Err(e) = send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
3043                             error!("{:?} command failed: {:#}", cmd, e);
3044                         } else {
3045                             info!("{:?} command successfully processed", cmd);
3046                         }
3047                     }
3048                 }
3049             }
3050             response
3051         }
3052     };
3053 
3054     cfg_if::cfg_if! {
3055         if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
3056             if !add_irq_control_tubes.is_empty() {
3057                 state
3058                     .irq_handler_control
3059                     .send(&IrqHandlerRequest::AddIrqControlTubes(
3060                         add_irq_control_tubes,
3061                     ))?;
3062             }
3063             if !add_vm_memory_control_tubes.is_empty() {
3064                 state
3065                     .vm_memory_handler_control
3066                     .send(&VmMemoryHandlerRequest::AddControlTubes(
3067                         add_vm_memory_control_tubes,
3068                     ))?;
3069             }
3070         }
3071     }
3072 
3073     Ok((Some(response), suspend_requested, run_mode_opt))
3074 }
3075 
process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, socket: &TaggedControlTube, ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)>3076 fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3077     state: &mut ControlLoopState<V, Vcpu>,
3078     id: usize,
3079     socket: &TaggedControlTube,
3080 ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
3081     let mut vm_control_ids_to_remove = Vec::new();
3082     let mut add_tubes = Vec::new();
3083     match socket {
3084         TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3085             Ok(request) => {
3086                 let (response, suspend_requested, run_mode_opt) =
3087                     process_vm_request(state, id, tube, request, &mut add_tubes)?;
3088 
3089                 if let Some(response) = response {
3090                     // If suspend requested skip that step since it will be
3091                     // performed by s2idle_wait thread when suspension actually
3092                     // happens.
3093                     if !suspend_requested {
3094                         if let Err(e) = tube.send(&response) {
3095                             error!("failed to send VmResponse: {}", e);
3096                         }
3097                     }
3098                 }
3099 
3100                 if let Some(run_mode) = run_mode_opt {
3101                     info!("control socket changed run mode to {}", run_mode);
3102                     match run_mode {
3103                         VmRunMode::Exiting => {
3104                             return Ok((true, Vec::new(), Vec::new()));
3105                         }
3106                         other => {
3107                             if other == VmRunMode::Running {
3108                                 for dev in &state.linux.resume_notify_devices {
3109                                     dev.lock().resume_imminent();
3110                                 }
3111                             }
3112                             // If suspend requested skip that step since it
3113                             // will be performed by s2idle_wait thread when
3114                             // needed.
3115                             if !suspend_requested {
3116                                 vcpu::kick_all_vcpus(
3117                                     state.vcpu_handles,
3118                                     state.linux.irq_chip.as_irq_chip(),
3119                                     VcpuControl::RunState(other),
3120                                 );
3121                             }
3122                         }
3123                     }
3124                 }
3125             }
3126             Err(e) => {
3127                 if let TubeError::Disconnected = e {
3128                     vm_control_ids_to_remove.push(id);
3129                 } else {
3130                     error!("failed to recv VmRequest: {}", e);
3131                 }
3132             }
3133         },
3134         TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMsyncRequest>() {
3135             Ok(request) => {
3136                 let response = request.execute(&mut state.linux.vm);
3137                 if let Err(e) = tube.send(&response) {
3138                     error!("failed to send VmMsyncResponse: {}", e);
3139                 }
3140             }
3141             Err(e) => {
3142                 if let TubeError::Disconnected = e {
3143                     vm_control_ids_to_remove.push(id);
3144                 } else {
3145                     error!("failed to recv VmMsyncRequest: {}", e);
3146                 }
3147             }
3148         },
3149         TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3150             Ok(request) => {
3151                 let response =
3152                     request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
3153                 if let Err(e) = tube.send(&response) {
3154                     error!("failed to send VmResponse: {}", e);
3155                 }
3156             }
3157             Err(e) => {
3158                 if let TubeError::Disconnected = e {
3159                     vm_control_ids_to_remove.push(id);
3160                 } else {
3161                     error!("failed to recv VmResponse: {}", e);
3162                 }
3163             }
3164         },
3165     }
3166 
3167     Ok((false, vm_control_ids_to_remove, add_tubes))
3168 }
3169 
#[cfg(feature = "registered_events")]
/// A `ProtoTube` connected to a registered-events listener, tagged with the
/// socket address it was connected to.
///
/// Identity (`PartialEq`/`Eq`/`Hash`) is keyed on `socket_addr` alone, so a
/// `HashSet<AddressedProtoTube>` holds at most one entry per listener address.
struct AddressedProtoTube {
    // Shared handle to the underlying tube; all entries for the same address
    // share one connection through this `Rc`.
    tube: Rc<ProtoTube>,
    // Address of the listening socket this tube is connected to.
    socket_addr: String,
}
3175 
#[cfg(feature = "registered_events")]
impl PartialEq for AddressedProtoTube {
    /// Two tubes are the same registration endpoint when they point at the
    /// same listening socket address; the `Rc<ProtoTube>` itself is ignored.
    fn eq(&self, other: &Self) -> bool {
        self.socket_addr.eq(&other.socket_addr)
    }
}
3182 
#[cfg(feature = "registered_events")]
// Marker impl: equality is a total equivalence on `socket_addr`, which allows
// the type to be stored in a `HashSet`.
impl Eq for AddressedProtoTube {}
3185 
#[cfg(feature = "registered_events")]
impl Hash for AddressedProtoTube {
    /// Hashes only `socket_addr`, keeping the hash consistent with the
    /// `PartialEq` impl, which also compares only the address.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        Hash::hash(&self.socket_addr, state);
    }
}
3192 
#[cfg(feature = "registered_events")]
impl AddressedProtoTube {
    /// Sends `msg` as a protobuf message over the underlying tube.
    ///
    /// # Errors
    ///
    /// Propagates the `TubeError` from the underlying `send_proto` call.
    pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
        self.tube.send_proto(msg)
    }
}
3199 
/// Scans `registered_tubes` for an existing connection to `socket_addr`.
///
/// Returns any tube already connected to that address (so the caller can
/// reuse it instead of reconnecting), together with a flag indicating whether
/// `socket_addr` is already registered for `event` specifically.
#[cfg(feature = "registered_events")]
fn find_registered_tube<'a>(
    registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    socket_addr: &str,
    event: RegisteredEvent,
) -> (Option<&'a Rc<ProtoTube>>, bool) {
    let mut existing_tube: Option<&Rc<ProtoTube>> = None;
    for (registered_event, tubes) in registered_tubes {
        for addr_tube in tubes.iter().filter(|t| t.socket_addr == socket_addr) {
            if *registered_event == event {
                // This address is already registered for the requested event;
                // stop searching immediately.
                return (existing_tube, true);
            }
            // All tubes for the same address are Rcs to the same underlying
            // tube, so any one of them can be reused. Keep scanning, though,
            // since a registration for `event` may still turn up.
            existing_tube = Some(&addr_tube.tube);
        }
    }
    (existing_tube, false)
}
3226 
/// Returns an `AddressedProtoTube` for `addr`, reusing `tube` when the caller
/// already holds a connection to that address.
///
/// When no existing tube is supplied, connects a new `ProtoTube` to the
/// listening socket at `addr`.
///
/// # Errors
///
/// Fails if connecting to the socket at `addr` or constructing the
/// `ProtoTube` from it fails.
#[cfg(feature = "registered_events")]
fn make_addr_tube_from_maybe_existing(
    tube: Option<&Rc<ProtoTube>>,
    addr: String,
) -> Result<AddressedProtoTube> {
    if let Some(registered_tube) = tube {
        // Reuse the existing connection; clones of the `Rc` all share the
        // same underlying tube.
        Ok(AddressedProtoTube {
            tube: registered_tube.clone(),
            socket_addr: addr,
        })
    } else {
        // Borrow `addr` for the connect call instead of cloning it; the owned
        // string is only needed afterwards for the returned struct.
        let sock = UnixSeqpacket::connect(&addr).with_context(|| {
            format!("failed to connect to registered listening socket {}", addr)
        })?;
        let tube = ProtoTube::new_from_unix_seqpacket(sock)?;
        Ok(AddressedProtoTube {
            tube: Rc::new(tube),
            socket_addr: addr,
        })
    }
}
3248 
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, cfg: Config, control_server_socket: Option<UnlinkUnixSeqpacketListener>, irq_control_tubes: Vec<Tube>, vm_memory_control_tubes: Vec<VmMemoryTube>, control_tubes: Vec<TaggedControlTube>, #[cfg(feature = "balloon")] balloon_host_tube: Option<Tube>, disk_host_tubes: &[Tube], #[cfg(feature = "gpu")] gpu_control_tube: Tube, #[cfg(feature = "usb")] usb_control_tube: Tube, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, sigchld_fd: SignalFd, gralloc: RutabagaGralloc, vcpu_ids: Vec<usize>, iommu_host_tube: Option<Tube>, #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>, #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>, #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>, #[allow(unused_mut)] #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube, guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>, metrics_tube: RecvTube, ) -> Result<ExitState>3249 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3250     mut linux: RunnableLinuxVm<V, Vcpu>,
3251     sys_allocator: SystemAllocator,
3252     cfg: Config,
3253     control_server_socket: Option<UnlinkUnixSeqpacketListener>,
3254     irq_control_tubes: Vec<Tube>,
3255     vm_memory_control_tubes: Vec<VmMemoryTube>,
3256     control_tubes: Vec<TaggedControlTube>,
3257     #[cfg(feature = "balloon")] balloon_host_tube: Option<Tube>,
3258     disk_host_tubes: &[Tube],
3259     #[cfg(feature = "gpu")] gpu_control_tube: Tube,
3260     #[cfg(feature = "usb")] usb_control_tube: Tube,
3261     vm_evt_rdtube: RecvTube,
3262     vm_evt_wrtube: SendTube,
3263     sigchld_fd: SignalFd,
3264     gralloc: RutabagaGralloc,
3265     vcpu_ids: Vec<usize>,
3266     iommu_host_tube: Option<Tube>,
3267     #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
3268     #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
3269     #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
3270     #[allow(unused_mut)] // mut is required x86 only
3271     #[cfg(feature = "swap")]
3272     mut swap_controller: Option<SwapController>,
3273     #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
3274     guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
3275     #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
3276     metrics_tube: RecvTube,
3277 ) -> Result<ExitState> {
3278     #[derive(EventToken)]
3279     enum Token {
3280         VmEvent,
3281         Suspend,
3282         ChildSignal,
3283         VmControlServer,
3284         VmControl {
3285             id: usize,
3286         },
3287         #[cfg(feature = "registered_events")]
3288         RegisteredEvent,
3289         #[cfg(feature = "balloon")]
3290         BalloonTube,
3291     }
3292     stdin()
3293         .set_raw_mode()
3294         .expect("failed to set terminal raw mode");
3295 
3296     let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
3297     let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
3298 
3299     let wait_ctx = WaitContext::build_with(&[
3300         (&linux.suspend_evt, Token::Suspend),
3301         (&sigchld_fd, Token::ChildSignal),
3302         (&vm_evt_rdtube, Token::VmEvent),
3303         #[cfg(feature = "registered_events")]
3304         (&reg_evt_rdtube, Token::RegisteredEvent),
3305     ])
3306     .context("failed to build wait context")?;
3307 
3308     if let Some(socket_server) = &control_server_socket {
3309         wait_ctx
3310             .add(socket_server, Token::VmControlServer)
3311             .context("failed to add descriptor to wait context")?;
3312     }
3313     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
3314     let mut next_control_id = control_tubes.len();
3315     for (id, socket) in control_tubes.iter() {
3316         wait_ctx
3317             .add(socket.as_ref(), Token::VmControl { id: *id })
3318             .context("failed to add descriptor to wait context")?;
3319     }
3320 
3321     #[cfg(feature = "balloon")]
3322     let mut balloon_tube = balloon_host_tube
3323         .map(|tube| -> Result<BalloonTube> {
3324             wait_ctx
3325                 .add(&tube, Token::BalloonTube)
3326                 .context("failed to add descriptor to wait context")?;
3327             Ok(BalloonTube::new(tube))
3328         })
3329         .transpose()
3330         .context("failed to create balloon tube")?;
3331 
3332     if cfg.jail_config.is_some() {
3333         // Before starting VCPUs, in case we started with some capabilities, drop them all.
3334         drop_capabilities().context("failed to drop process capabilities")?;
3335     }
3336 
3337     #[cfg(feature = "gdb")]
3338     // Create a channel for GDB thread.
3339     let (to_gdb_channel, from_vcpu_channel) = if linux.gdb.is_some() {
3340         let (s, r) = mpsc::channel();
3341         (Some(s), Some(r))
3342     } else {
3343         (None, None)
3344     };
3345 
3346     let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
3347     // Create devices thread, and restore if a restore file exists.
3348     linux.devices_thread = match create_devices_worker_thread(
3349         linux.vm.get_memory().clone(),
3350         linux.io_bus.clone(),
3351         linux.mmio_bus.clone(),
3352         device_ctrl_resp,
3353     ) {
3354         Ok(join_handle) => Some(join_handle),
3355         Err(e) => {
3356             return Err(anyhow!("Failed to start devices thread: {}", e));
3357         }
3358     };
3359 
3360     let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
3361     let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
3362 
3363     if !linux
3364         .vm
3365         .get_hypervisor()
3366         .check_capability(HypervisorCap::ImmediateExit)
3367     {
3368         return Err(anyhow!(
3369             "missing required hypervisor capability ImmediateExit"
3370         ));
3371     }
3372 
3373     vcpu::setup_vcpu_signal_handler()?;
3374 
3375     let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
3376         Some(vec) => vec.into_iter().map(Some).collect(),
3377         None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
3378     };
3379     // Enable core scheduling before creating vCPUs so that the cookie will be
3380     // shared by all vCPU threads.
3381     // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
3382     // itself for even better performance. Only vCPUs need the feature.
3383     if cfg.core_scheduling && cfg.per_vm_core_scheduling {
3384         if let Err(e) = enable_core_scheduling() {
3385             error!("Failed to enable core scheduling: {}", e);
3386         }
3387     }
3388     let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
3389         None => None,
3390         Some(cgroup_path) => {
3391             // Move main process to cgroup_path
3392             let mut f = File::create(&cgroup_path.join("tasks")).with_context(|| {
3393                 format!(
3394                     "failed to create vcpu-cgroup-path {}",
3395                     cgroup_path.display(),
3396                 )
3397             })?;
3398             f.write_all(process::id().to_string().as_bytes())?;
3399             Some(f)
3400         }
3401     };
3402     #[cfg(target_arch = "x86_64")]
3403     let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
3404     #[cfg(target_arch = "x86_64")]
3405     if cfg.bus_lock_ratelimit > 0 {
3406         let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
3407         if linux.vm.check_capability(VmCap::BusLockDetect) {
3408             info!("Hypervisor support bus lock detect");
3409             linux
3410                 .vm
3411                 .enable_capability(VmCap::BusLockDetect, 0)
3412                 .expect("kvm: Failed to enable bus lock detection cap");
3413             info!("Hypervisor enabled bus lock detect");
3414             bus_lock_ratelimit_ctrl
3415                 .lock()
3416                 .ratelimit_set_speed(bus_lock_ratelimit);
3417         } else {
3418             bail!("Kvm: bus lock detection unsuported");
3419         }
3420     }
3421 
3422     #[cfg(target_os = "android")]
3423     android::set_process_profiles(&cfg.task_profiles)?;
3424 
3425     #[allow(unused_mut)]
3426     let mut run_mode = if cfg.suspended {
3427         // Sleep devices before creating vcpus.
3428         device_ctrl_tube
3429             .send(&DeviceControlCommand::SleepDevices)
3430             .context("send command to devices control socket")?;
3431         match device_ctrl_tube
3432             .recv()
3433             .context("receive from devices control socket")?
3434         {
3435             VmResponse::Ok => (),
3436             resp => bail!("device sleep failed: {}", resp),
3437         }
3438         VmRunMode::Suspending
3439     } else {
3440         VmRunMode::Running
3441     };
3442     #[cfg(feature = "gdb")]
3443     if to_gdb_channel.is_some() {
3444         // Wait until a GDB client attaches
3445         run_mode = VmRunMode::Breakpoint;
3446     }
3447     // If we are restoring from a snapshot, then start suspended.
3448     let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
3449         (VmRunMode::Suspending, run_mode)
3450     } else {
3451         (run_mode, run_mode)
3452     };
3453 
3454     #[cfg(feature = "pvclock")]
3455     let pvclock_host_tube = pvclock_host_tube.map(Arc::new);
3456 
3457     // Architecture-specific code must supply a vcpu_init element for each VCPU.
3458     assert_eq!(vcpus.len(), linux.vcpu_init.len());
3459 
3460     for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
3461     {
3462         let (to_vcpu_channel, from_main_channel) = mpsc::channel();
3463         let vcpu_affinity = match linux.vcpu_affinity.clone() {
3464             Some(VcpuAffinity::Global(v)) => v,
3465             Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
3466             None => Default::default(),
3467         };
3468 
3469         #[cfg(target_arch = "x86_64")]
3470         let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
3471             Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
3472         } else {
3473             None
3474         };
3475 
3476         #[cfg(target_arch = "x86_64")]
3477         let cpu_config = Some(CpuConfigX86_64::new(
3478             cfg.force_calibrated_tsc_leaf,
3479             cfg.host_cpu_topology,
3480             cfg.enable_hwp,
3481             cfg.no_smt,
3482             cfg.itmt,
3483             vcpu_hybrid_type,
3484         ));
3485         #[cfg(target_arch = "x86_64")]
3486         let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
3487 
3488         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3489         let cpu_config = None;
3490 
3491         #[cfg(target_arch = "riscv64")]
3492         let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
3493 
3494         let handle = vcpu::run_vcpu(
3495             cpu_id,
3496             vcpu_ids[cpu_id],
3497             vcpu,
3498             vcpu_init,
3499             linux.vm.try_clone().context("failed to clone vm")?,
3500             linux
3501                 .irq_chip
3502                 .try_box_clone()
3503                 .context("failed to clone irqchip")?,
3504             linux.vcpu_count,
3505             linux.rt_cpus.contains(&cpu_id),
3506             vcpu_affinity,
3507             linux.delay_rt,
3508             vcpu_thread_barrier.clone(),
3509             (*linux.io_bus).clone(),
3510             (*linux.mmio_bus).clone(),
3511             vm_evt_wrtube
3512                 .try_clone()
3513                 .context("failed to clone vm event tube")?,
3514             from_main_channel,
3515             #[cfg(feature = "gdb")]
3516             to_gdb_channel.clone(),
3517             cfg.core_scheduling,
3518             cfg.per_vm_core_scheduling,
3519             cpu_config,
3520             match vcpu_cgroup_tasks_file {
3521                 None => None,
3522                 Some(ref f) => Some(
3523                     f.try_clone()
3524                         .context("failed to clone vcpu cgroup tasks file")?,
3525                 ),
3526             },
3527             #[cfg(target_arch = "x86_64")]
3528             bus_lock_ratelimit_ctrl,
3529             run_mode,
3530             cfg.boost_uclamp,
3531         )?;
3532         vcpu_handles.push((handle, to_vcpu_channel));
3533     }
3534 
3535     #[cfg(feature = "gdb")]
3536     // Spawn GDB thread.
3537     if let Some((gdb_port_num, gdb_control_tube)) = linux.gdb.take() {
3538         let to_vcpu_channels = vcpu_handles
3539             .iter()
3540             .map(|(_handle, channel)| channel.clone())
3541             .collect();
3542         let target = GdbStub::new(
3543             gdb_control_tube,
3544             to_vcpu_channels,
3545             from_vcpu_channel.unwrap(), // Must succeed to unwrap()
3546         );
3547         std::thread::Builder::new()
3548             .name("gdb".to_owned())
3549             .spawn(move || gdb_thread(target, gdb_port_num))
3550             .context("failed to spawn GDB thread")?;
3551     };
3552 
3553     let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
3554     let sys_allocator_for_thread = sys_allocator_mutex.clone();
3555     let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
3556     let irq_handler_thread = std::thread::Builder::new()
3557         .name("irq_handler_thread".into())
3558         .spawn(move || {
3559             irq_handler_thread(
3560                 irq_control_tubes,
3561                 irq_chip_for_thread,
3562                 sys_allocator_for_thread,
3563                 irq_handler_control_for_thread,
3564             )
3565         })
3566         .unwrap();
3567 
3568     let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
3569     let vm_memory_handler_thread = std::thread::Builder::new()
3570         .name("vm_memory_handler_thread".into())
3571         .spawn({
3572             let vm = linux.vm.try_clone().context("failed to clone Vm")?;
3573             let sys_allocator_mutex = sys_allocator_mutex.clone();
3574             let iommu_client = iommu_host_tube
3575                 .as_ref()
3576                 .map(|t| VmMemoryRequestIommuClient::new(t.clone()));
3577             move || {
3578                 vm_memory_handler_thread(
3579                     vm_memory_control_tubes,
3580                     vm,
3581                     sys_allocator_mutex,
3582                     gralloc,
3583                     iommu_client,
3584                     vm_memory_handler_control_for_thread,
3585                 )
3586             }
3587         })
3588         .unwrap();
3589 
3590     vcpu_thread_barrier.wait();
3591 
3592     // Restore VM (if applicable).
3593     // Must happen after the vCPU barrier to avoid deadlock.
3594     if let Some(path) = &cfg.restore_path {
3595         vm_control::do_restore(
3596             path.clone(),
3597             &linux.vm,
3598             |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
3599             |msg, index| {
3600                 vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
3601             },
3602             &irq_handler_control,
3603             &device_ctrl_tube,
3604             linux.vcpu_count,
3605             |image| {
3606                 linux
3607                     .irq_chip
3608                     .try_box_clone()?
3609                     .restore(image, linux.vcpu_count)
3610             },
3611             /* require_encrypted= */ false,
3612         )?;
3613         // Allow the vCPUs to start for real.
3614         vcpu::kick_all_vcpus(
3615             &vcpu_handles,
3616             linux.irq_chip.as_irq_chip(),
3617             VcpuControl::RunState(post_restore_run_mode),
3618         )
3619     }
3620 
3621     #[cfg(feature = "swap")]
3622     if let Some(swap_controller) = &swap_controller {
3623         swap_controller
3624             .on_static_devices_setup_complete()
3625             .context("static device setup complete")?;
3626     }
3627 
3628     let metrics_thread = if metrics::is_initialized() {
3629         Some(
3630             std::thread::Builder::new()
3631                 .name("metrics_thread".into())
3632                 .spawn(move || {
3633                     if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
3634                         error!("Metrics controller error: {:?}", e);
3635                     }
3636                 })
3637                 .context("metrics thread failed")?,
3638         )
3639     } else {
3640         None
3641     };
3642 
3643     let mut exit_state = ExitState::Stop;
3644     let mut pvpanic_code = PvPanicCode::Unknown;
3645     #[cfg(feature = "registered_events")]
3646     let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
3647         HashMap::new();
3648 
3649     'wait: loop {
3650         let events = {
3651             match wait_ctx.wait() {
3652                 Ok(v) => v,
3653                 Err(e) => {
3654                     error!("failed to poll: {}", e);
3655                     break;
3656                 }
3657             }
3658         };
3659 
3660         let mut vm_control_ids_to_remove = Vec::new();
3661         for event in events.iter().filter(|e| e.is_readable) {
3662             match event.token {
3663                 #[cfg(feature = "registered_events")]
3664                 Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
3665                     Ok(reg_evt) => {
3666                         let evt = reg_evt.into_event();
3667                         let mut tubes_to_remove: Vec<String> = Vec::new();
3668                         if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
3669                             for tube in tubes.iter() {
3670                                 if let Err(e) = tube.send(&reg_evt.into_proto()) {
3671                                     warn!(
3672                                         "failed to send registered event {:?} to {}, removing from \
3673                                          registrations: {}",
3674                                         reg_evt, tube.socket_addr, e
3675                                     );
3676                                     tubes_to_remove.push(tube.socket_addr.clone());
3677                                 }
3678                             }
3679                         }
3680                         for tube_addr in tubes_to_remove {
3681                             for tubes in registered_evt_tubes.values_mut() {
3682                                 tubes.retain(|t| t.socket_addr != tube_addr);
3683                             }
3684                         }
3685                         registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
3686                     }
3687                     Err(e) => {
3688                         warn!("failed to recv RegisteredEvent: {}", e);
3689                     }
3690                 },
3691                 Token::VmEvent => {
3692                     let mut break_to_wait: bool = true;
3693                     match vm_evt_rdtube.recv::<VmEventType>() {
3694                         Ok(vm_event) => match vm_event {
3695                             VmEventType::Exit => {
3696                                 info!("vcpu requested shutdown");
3697                                 exit_state = ExitState::Stop;
3698                             }
3699                             VmEventType::Reset => {
3700                                 info!("vcpu requested reset");
3701                                 exit_state = ExitState::Reset;
3702                             }
3703                             VmEventType::Crash => {
3704                                 info!("vcpu crashed");
3705                                 exit_state = ExitState::Crash;
3706                             }
3707                             VmEventType::Panic(panic_code) => {
3708                                 pvpanic_code = PvPanicCode::from_u8(panic_code);
3709                                 info!("Guest reported panic [Code: {}]", pvpanic_code);
3710                                 break_to_wait = false;
3711                             }
3712                             VmEventType::WatchdogReset => {
3713                                 info!("vcpu stall detected");
3714                                 exit_state = ExitState::WatchdogReset;
3715                             }
3716                         },
3717                         Err(e) => {
3718                             warn!("failed to recv VmEvent: {}", e);
3719                         }
3720                     }
3721                     if break_to_wait {
3722                         if pvpanic_code == PvPanicCode::Panicked {
3723                             exit_state = ExitState::GuestPanic;
3724                         }
3725                         break 'wait;
3726                     }
3727                 }
3728                 Token::Suspend => {
3729                     info!("VM requested suspend");
3730                     linux.suspend_evt.wait().unwrap();
3731                     vcpu::kick_all_vcpus(
3732                         &vcpu_handles,
3733                         linux.irq_chip.as_irq_chip(),
3734                         VcpuControl::RunState(VmRunMode::Suspending),
3735                     );
3736                 }
3737                 Token::ChildSignal => {
3738                     // Print all available siginfo structs, then exit the loop if child process has
3739                     // been exited except CLD_STOPPED and CLD_CONTINUED. the two should be ignored
3740                     // here since they are used by the vmm-swap feature.
3741                     let mut do_exit = false;
3742                     while let Some(siginfo) =
3743                         sigchld_fd.read().context("failed to read signalfd")?
3744                     {
3745                         let pid = siginfo.ssi_pid;
3746                         let pid_label = match linux.pid_debug_label_map.get(&pid) {
3747                             Some(label) => format!("{} (pid {})", label, pid),
3748                             None => format!("pid {}", pid),
3749                         };
3750 
3751                         // TODO(kawasin): this is a temporary exception until device suspension.
3752                         #[cfg(feature = "swap")]
3753                         if siginfo.ssi_code == libc::CLD_STOPPED
3754                             || siginfo.ssi_code == libc::CLD_CONTINUED
3755                         {
3756                             continue;
3757                         }
3758 
3759                         // Ignore clean exits of non-tracked child processes when running without
3760                         // sandboxing. The virtio gpu process launches a render server for
3761                         // pass-through graphics. Host GPU drivers have been observed to fork
3762                         // child processes that exit cleanly which should not be considered a
3763                         // crash. When running with sandboxing, this should be handled by the
3764                         // device's process handler.
3765                         if cfg.jail_config.is_none()
3766                             && !linux.pid_debug_label_map.contains_key(&pid)
3767                             && siginfo.ssi_signo == libc::SIGCHLD as u32
3768                             && siginfo.ssi_code == libc::CLD_EXITED
3769                             && siginfo.ssi_status == 0
3770                         {
3771                             continue;
3772                         }
3773 
3774                         error!(
3775                             "child {} exited: signo {}, status {}, code {}",
3776                             pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
3777                         );
3778                         do_exit = true;
3779                     }
3780                     if do_exit {
3781                         exit_state = ExitState::Crash;
3782                         break 'wait;
3783                     }
3784                 }
3785                 Token::VmControlServer => {
3786                     if let Some(socket_server) = &control_server_socket {
3787                         match socket_server.accept() {
3788                             Ok(socket) => {
3789                                 let id = next_control_id;
3790                                 next_control_id += 1;
3791                                 wait_ctx
3792                                     .add(&socket, Token::VmControl { id })
3793                                     .context("failed to add descriptor to wait context")?;
3794                                 control_tubes.insert(
3795                                     id,
3796                                     TaggedControlTube::Vm(Tube::new_from_unix_seqpacket(socket)?),
3797                                 );
3798                             }
3799                             Err(e) => error!("failed to accept socket: {}", e),
3800                         }
3801                     }
3802                 }
3803                 Token::VmControl { id } => {
3804                     if let Some(socket) = control_tubes.get(&id) {
3805                         let mut state = ControlLoopState {
3806                             linux: &mut linux,
3807                             cfg: &cfg,
3808                             sys_allocator: &sys_allocator_mutex,
3809                             control_tubes: &control_tubes,
3810                             disk_host_tubes,
3811                             #[cfg(feature = "gpu")]
3812                             gpu_control_tube: &gpu_control_tube,
3813                             #[cfg(feature = "usb")]
3814                             usb_control_tube: &usb_control_tube,
3815                             #[cfg(target_arch = "x86_64")]
3816                             iommu_host_tube: &iommu_host_tube,
3817                             #[cfg(target_arch = "x86_64")]
3818                             hp_control_tube: &hp_control_tube,
3819                             guest_suspended_cvar: &guest_suspended_cvar,
3820                             #[cfg(feature = "pci-hotplug")]
3821                             hotplug_manager: &mut hotplug_manager,
3822                             #[cfg(feature = "swap")]
3823                             swap_controller: &mut swap_controller,
3824                             vcpu_handles: &vcpu_handles,
3825                             #[cfg(feature = "balloon")]
3826                             balloon_tube: balloon_tube.as_mut(),
3827                             device_ctrl_tube: &device_ctrl_tube,
3828                             irq_handler_control: &irq_handler_control,
3829                             #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3830                             vm_memory_handler_control: &vm_memory_handler_control,
3831                             #[cfg(feature = "registered_events")]
3832                             registered_evt_tubes: &mut registered_evt_tubes,
3833                             #[cfg(feature = "pvclock")]
3834                             pvclock_host_tube: pvclock_host_tube.clone(),
3835                         };
3836                         let (exit_requested, mut ids_to_remove, add_tubes) =
3837                             process_vm_control_event(&mut state, id, socket)?;
3838                         if exit_requested {
3839                             break 'wait;
3840                         }
3841                         vm_control_ids_to_remove.append(&mut ids_to_remove);
3842                         for socket in add_tubes {
3843                             let id = next_control_id;
3844                             next_control_id += 1;
3845                             wait_ctx
3846                                 .add(socket.as_ref(), Token::VmControl { id })
3847                                 .context(
3848                                     "failed to add hotplug vfio-pci descriptor to wait context",
3849                                 )?;
3850                             control_tubes.insert(id, socket);
3851                         }
3852                     }
3853                 }
3854                 #[cfg(feature = "balloon")]
3855                 Token::BalloonTube => {
3856                     match balloon_tube.as_mut().expect("missing balloon tube").recv() {
3857                         Ok(resp) => {
3858                             for (resp, idx) in resp {
3859                                 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
3860                                     if let Err(e) = tube.send(&resp) {
3861                                         error!("failed to send VmResponse: {}", e);
3862                                     }
3863                                 } else {
3864                                     error!("Bad tube index {}", idx);
3865                                 }
3866                             }
3867                         }
3868                         Err(err) => {
3869                             error!("Error processing balloon tube {:?}", err)
3870                         }
3871                     }
3872                 }
3873             }
3874         }
3875 
3876         remove_hungup_and_drained_tubes(
3877             &events,
3878             &wait_ctx,
3879             &mut control_tubes,
3880             vm_control_ids_to_remove,
3881             |token: &Token| {
3882                 if let Token::VmControl { id } = token {
3883                     return Some(*id);
3884                 }
3885                 None
3886             },
3887         )?;
3888     }
3889 
3890     vcpu::kick_all_vcpus(
3891         &vcpu_handles,
3892         linux.irq_chip.as_irq_chip(),
3893         VcpuControl::RunState(VmRunMode::Exiting),
3894     );
3895     for (handle, _) in vcpu_handles {
3896         if let Err(e) = handle.join() {
3897             error!("failed to join vcpu thread: {:?}", e);
3898         }
3899     }
3900 
3901     // After joining all vcpu threads, unregister the process-wide signal handler.
3902     if let Err(e) = vcpu::remove_vcpu_signal_handler() {
3903         error!("failed to remove vcpu thread signal handler: {:#}", e);
3904     }
3905 
3906     // Stop the vmm-swap monitor process.
3907     #[cfg(feature = "swap")]
3908     drop(swap_controller);
3909 
3910     // Stop pci root worker thread
3911     #[cfg(target_arch = "x86_64")]
3912     {
3913         let _ = hp_control_tube.send(PciRootCommand::Kill);
3914         if let Err(e) = hp_thread.join() {
3915             error!("failed to join hotplug thread: {:?}", e);
3916         }
3917     }
3918 
3919     if linux.devices_thread.is_some() {
3920         if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
3921             error!("failed to stop device control loop: {}", e);
3922         };
3923         if let Some(thread) = linux.devices_thread.take() {
3924             if let Err(e) = thread.join() {
3925                 error!("failed to exit devices thread: {:?}", e);
3926             }
3927         }
3928     }
3929 
3930     // Shut down the VM Memory handler thread.
3931     if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
3932         error!(
3933             "failed to request exit from VM Memory handler thread: {}",
3934             e
3935         );
3936     }
3937     if let Err(e) = vm_memory_handler_thread.join() {
3938         error!("failed to exit VM Memory handler thread: {:?}", e);
3939     }
3940 
3941     // Shut down the IRQ handler thread.
3942     if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
3943         error!("failed to request exit from IRQ handler thread: {}", e);
3944     }
3945     if let Err(e) = irq_handler_thread.join() {
3946         error!("failed to exit irq handler thread: {:?}", e);
3947     }
3948 
3949     // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
3950     // inside `linux`. If the checks below fail, then some other thread is probably still running
3951     // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
3952     // cleaned up.
3953     match Arc::try_unwrap(std::mem::replace(
3954         &mut linux.mmio_bus,
3955         Arc::new(Bus::new(BusType::Mmio)),
3956     )) {
3957         Ok(_) => {}
3958         Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
3959     }
3960     match Arc::try_unwrap(std::mem::replace(
3961         &mut linux.io_bus,
3962         Arc::new(Bus::new(BusType::Io)),
3963     )) {
3964         Ok(_) => {}
3965         Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
3966     }
3967 
3968     // Explicitly drop the VM structure here to allow the devices to clean up before the
3969     // control sockets are closed when this function exits.
3970     mem::drop(linux);
3971 
3972     // Drop the hotplug manager to tell the warden process to exit before we try to join
3973     // the metrics thread.
3974     #[cfg(feature = "pci-hotplug")]
3975     mem::drop(hotplug_manager);
3976 
3977     // All our children should have exited by now, so closing our fd should
3978     // terminate metrics. Then join so that everything gets flushed.
3979     metrics::get_destructor().cleanup();
3980     if let Some(metrics_thread) = metrics_thread {
3981         if let Err(e) = metrics_thread.join() {
3982             error!("failed to exit irq handler thread: {:?}", e);
3983         }
3984     }
3985 
3986     stdin()
3987         .set_canon_mode()
3988         .expect("failed to restore canonical mode for terminal");
3989 
3990     Ok(exit_state)
3991 }
3992 
// Poll tokens used by the IRQ handler thread's `WaitContext`.
#[derive(EventToken)]
enum IrqHandlerToken {
    // An IRQ event registered with the irqchip fired; `index` selects which
    // irqchip event to service via `service_irq_event`.
    IrqFd { index: IrqEventIndex },
    // A `VmIrqRequest` is readable on the IRQ control tube with this `id`.
    VmIrq { id: usize },
    // The irqchip's delayed-event trigger fired; deferred IRQ events should be
    // processed via `process_delayed_irq_events`.
    DelayedIrqFd,
    // A control message (`IrqHandlerRequest`) arrived on the handler control tube.
    HandlerControl,
}
4000 
/// Handles IRQs and requests from devices to add additional IRQ lines.
///
/// Event-loop thread that:
/// - services IRQ event descriptors registered with `irq_chip`,
/// - dispatches `VmIrqRequest`s arriving on per-device `irq_control_tubes`,
/// - reacts to `IrqHandlerRequest` control messages from `handler_control`
///   (exit, add control tubes, refresh IRQ event tokens, wake-and-notify).
///
/// Returns `Ok(())` when asked to exit, when polling fails, or when the
/// control tube hangs up without requesting an exit; propagates errors from
/// wait-context registration.
fn irq_handler_thread(
    irq_control_tubes: Vec<Tube>,
    mut irq_chip: Box<dyn IrqChipArch + 'static>,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    let wait_ctx = WaitContext::build_with(&[(
        handler_control.get_read_notifier(),
        IrqHandlerToken::HandlerControl,
    )])
    .context("failed to build wait context")?;

    // Not all irqchips expose a delayed-event trigger; only poll it if present.
    if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
        wait_ctx
            .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
            .context("failed to add descriptor to wait context")?;
    }

    // Kept so the tokens can be deleted & re-added on RefreshIrqEventTokens.
    let mut irq_event_tokens = irq_chip
        .irq_event_tokens()
        .context("failed get event tokens from irqchip")?;

    for (index, _gsi, evt) in irq_event_tokens.iter() {
        wait_ctx
            .add(evt, IrqHandlerToken::IrqFd { index: *index })
            .context("failed to add irq chip event tokens to wait context")?;
    }

    // Control tubes are keyed by a monotonically increasing id; new tubes added
    // later via AddIrqControlTubes continue the same id sequence.
    let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
    let mut next_control_id = irq_control_tubes.len();
    for (id, socket) in irq_control_tubes.iter() {
        wait_ctx
            .add(
                socket.get_read_notifier(),
                IrqHandlerToken::VmIrq { id: *id },
            )
            .context("irq control tubes to wait context")?;
    }

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break 'wait;
                }
            }
        };
        let token_count = events.len();
        let mut vm_irq_tubes_to_remove = Vec::new();
        let mut notify_control_on_iteration_end = false;

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                IrqHandlerToken::HandlerControl => {
                    match handler_control.recv::<IrqHandlerRequest>() {
                        Ok(request) => {
                            match request {
                                IrqHandlerRequest::Exit => break 'wait,
                                IrqHandlerRequest::AddIrqControlTubes(tubes) => {
                                    for socket in tubes {
                                        let id = next_control_id;
                                        next_control_id += 1;
                                        wait_ctx
                                        .add(
                                            socket.get_read_notifier(),
                                            IrqHandlerToken::VmIrq { id },
                                        )
                                        .context("failed to add new IRQ control Tube to wait context")?;
                                        irq_control_tubes.insert(id, socket);
                                    }
                                }
                                IrqHandlerRequest::RefreshIrqEventTokens => {
                                    // Drop the stale tokens before re-querying the
                                    // irqchip, then re-register the fresh set.
                                    for (_index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx.delete(evt).context(
                                            "failed to remove irq chip event \
                                                token from wait context",
                                        )?;
                                    }

                                    irq_event_tokens = irq_chip
                                        .irq_event_tokens()
                                        .context("failed get event tokens from irqchip")?;
                                    for (index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx
                                            .add(evt, IrqHandlerToken::IrqFd { index: *index })
                                            .context(
                                                "failed to add irq chip event \
                                                tokens to wait context",
                                            )?;
                                    }

                                    // The requester waits for this acknowledgement;
                                    // failure to send is logged but not fatal here.
                                    if let Err(e) = handler_control
                                        .send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
                                    {
                                        error!(
                                            "failed to notify IRQ event token refresh \
                                            was completed: {}",
                                            e
                                        );
                                    }
                                }
                                IrqHandlerRequest::WakeAndNotifyIteration => {
                                    // Deferred: the notification is sent after this
                                    // whole batch of events has been processed.
                                    notify_control_on_iteration_end = true;
                                }
                            }
                        }
                        Err(e) => {
                            if let TubeError::Disconnected = e {
                                // The main thread owns the other end; losing it is
                                // unrecoverable for this thread.
                                panic!("irq handler control tube disconnected.");
                            } else {
                                error!("failed to recv IrqHandlerRequest: {}", e);
                            }
                        }
                    }
                }
                IrqHandlerToken::VmIrq { id } => {
                    if let Some(tube) = irq_control_tubes.get(&id) {
                        handle_irq_tube_request(
                            &sys_allocator_mutex,
                            &mut irq_chip,
                            &mut vm_irq_tubes_to_remove,
                            &wait_ctx,
                            tube,
                            id,
                        );
                    }
                }
                IrqHandlerToken::IrqFd { index } => {
                    if let Err(e) = irq_chip.service_irq_event(index) {
                        error!("failed to signal irq {}: {}", index, e);
                    }
                }
                IrqHandlerToken::DelayedIrqFd => {
                    if let Err(e) = irq_chip.process_delayed_irq_events() {
                        warn!("can't deliver delayed irqs: {}", e);
                    }
                }
            }
        }

        if notify_control_on_iteration_end {
            // NOTE(review): the `- 1` presumably excludes the HandlerControl event
            // that carried the WakeAndNotifyIteration request itself — confirm
            // against the receiver's expectations.
            if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
                token_count - 1,
            )) {
                error!(
                    "failed to notify on iteration completion (snapshotting may fail): {}",
                    e
                );
            }
        }

        // Drain and unregister tubes that hung up, plus any flagged during
        // request handling, without dropping buffered data.
        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut irq_control_tubes,
            vm_irq_tubes_to_remove,
            |token: &IrqHandlerToken| {
                if let IrqHandlerToken::VmIrq { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hangup on the control tube with no pending data means the peer died
        // without sending Exit; shut down rather than spin.
        if events.iter().any(|e| {
            e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
        }) {
            error!("IRQ handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4176 
handle_irq_tube_request( sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>, irq_chip: &mut Box<dyn IrqChipArch + 'static>, vm_irq_tubes_to_remove: &mut Vec<usize>, wait_ctx: &WaitContext<IrqHandlerToken>, tube: &Tube, tube_index: usize, )4177 fn handle_irq_tube_request(
4178     sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
4179     irq_chip: &mut Box<dyn IrqChipArch + 'static>,
4180     vm_irq_tubes_to_remove: &mut Vec<usize>,
4181     wait_ctx: &WaitContext<IrqHandlerToken>,
4182     tube: &Tube,
4183     tube_index: usize,
4184 ) {
4185     match tube.recv::<VmIrqRequest>() {
4186         Ok(request) => {
4187             let response = {
4188                 request.execute(
4189                     |setup| match setup {
4190                         IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
4191                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4192                             let source = IrqEventSource {
4193                                 device_id: device_id.try_into().expect("Invalid device_id"),
4194                                 queue_id,
4195                                 device_name,
4196                             };
4197                             if let Some(event_index) =
4198                                 irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
4199                             {
4200                                 if let Err(e) =
4201                                     wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
4202                                 {
4203                                     warn!("failed to add IrqFd to poll context: {}", e);
4204                                     return Err(e);
4205                                 }
4206                             }
4207                             Ok(())
4208                         }
4209                         IrqSetup::Route(route) => irq_chip.route_irq(route),
4210                         IrqSetup::UnRegister(irq, ev) => {
4211                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4212                             irq_chip.unregister_edge_irq_event(irq, &irq_evt)
4213                         }
4214                     },
4215                     &mut sys_allocator_mutex.lock(),
4216                 )
4217             };
4218             if let Err(e) = tube.send(&response) {
4219                 error!("failed to send VmIrqResponse: {}", e);
4220             }
4221         }
4222         Err(e) => {
4223             if let TubeError::Disconnected = e {
4224                 vm_irq_tubes_to_remove.push(tube_index);
4225             } else {
4226                 error!("failed to recv VmIrqRequest: {}", e);
4227             }
4228         }
4229     }
4230 }
4231 
/// Commands to control the VM Memory handler thread.
///
/// Sent over a `Tube` to the VM memory handler thread's control channel.
/// NOTE(review): this enum is serde-serialized; depending on the wire format,
/// variant order may matter — audit senders/receivers before reordering.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// Register additional memory control tubes with the handler's wait
    /// context. No response is sent for this command.
    AddControlTubes(Vec<VmMemoryTube>),
    /// Ask the handler thread's event loop to exit.
    /// No response is sent for this command.
    Exit,
}
4240 
/// Event-loop thread servicing `VmMemoryRequest`s from devices.
///
/// Polls the per-device `control_tubes` plus the `handler_control` tube, and
/// executes each received request against the VM, system allocator, gralloc,
/// and (optionally, per tube) the IOMMU client. Control messages allow adding
/// new tubes at runtime (`AddControlTubes`) and shutting the loop down
/// (`Exit`).
///
/// Returns `Ok(())` on a requested exit, on poll failure, or when the control
/// tube hangs up without requesting an exit; propagates errors from
/// wait-context registration.
fn vm_memory_handler_thread(
    control_tubes: Vec<VmMemoryTube>,
    mut vm: impl Vm,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    mut gralloc: RutabagaGralloc,
    mut iommu_client: Option<VmMemoryRequestIommuClient>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    // Thread-local poll tokens: one per device control tube (by id), plus the
    // control channel from the main thread.
    #[derive(EventToken)]
    enum Token {
        VmControl { id: usize },
        HandlerControl,
    }

    let wait_ctx =
        WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
            .context("failed to build wait context")?;
    // Tubes are keyed by a monotonically increasing id; tubes added later via
    // AddControlTubes continue the same id sequence.
    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    let mut next_control_id = control_tubes.len();
    for (id, socket) in control_tubes.iter() {
        wait_ctx
            .add(socket.as_ref(), Token::VmControl { id: *id })
            .context("failed to add descriptor to wait context")?;
    }

    // Shared bookkeeping for memory regions, threaded through every request.
    let mut region_state = VmMemoryRegionState::new();

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break;
                }
            }
        };

        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
                    Ok(request) => match request {
                        VmMemoryHandlerRequest::Exit => break 'wait,
                        VmMemoryHandlerRequest::AddControlTubes(tubes) => {
                            for socket in tubes {
                                let id = next_control_id;
                                next_control_id += 1;
                                wait_ctx
                                    .add(socket.get_read_notifier(), Token::VmControl { id })
                                    .context(
                                        "failed to add new vm memory control Tube to wait context",
                                    )?;
                                control_tubes.insert(id, socket);
                            }
                        }
                    },
                    Err(e) => {
                        if let TubeError::Disconnected = e {
                            // The main thread owns the other end; losing it is
                            // unrecoverable for this thread.
                            panic!("vm memory control tube disconnected.");
                        } else {
                            error!("failed to recv VmMemoryHandlerRequest: {}", e);
                        }
                    }
                },
                Token::VmControl { id } => {
                    if let Some(VmMemoryTube {
                        tube,
                        expose_with_viommu,
                    }) = control_tubes.get(&id)
                    {
                        match tube.recv::<VmMemoryRequest>() {
                            Ok(request) => {
                                let response = request.execute(
                                    &mut vm,
                                    &mut sys_allocator_mutex.lock(),
                                    &mut gralloc,
                                    // Only tubes flagged for viommu exposure get
                                    // access to the IOMMU client.
                                    if *expose_with_viommu {
                                        iommu_client.as_mut()
                                    } else {
                                        None
                                    },
                                    &mut region_state,
                                );
                                if let Err(e) = tube.send(&response) {
                                    error!("failed to send VmMemoryControlResponse: {}", e);
                                }
                            }
                            Err(e) => {
                                if let TubeError::Disconnected = e {
                                    // Device went away: queue the tube for removal.
                                    vm_control_ids_to_remove.push(id);
                                } else {
                                    error!("failed to recv VmMemoryControlRequest: {}", e);
                                }
                            }
                        }
                    }
                }
            }
        }

        // Drain and unregister tubes that hung up, plus any flagged during
        // request handling, without dropping buffered data.
        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut control_tubes,
            vm_control_ids_to_remove,
            |token: &Token| {
                if let Token::VmControl { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hangup on the control tube with no pending data means the peer died
        // without sending Exit; shut down rather than spin.
        if events
            .iter()
            .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
        {
            error!("vm memory handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4364 
4365 /// When control tubes hang up, we want to make sure that we've fully drained
4366 /// the underlying socket before removing it. This function also handles
4367 /// removing closed sockets in such a way that avoids phantom events.
4368 ///
4369 /// `tube_ids_to_remove` is the set of ids that we already know should
4370 /// be removed (e.g. from getting a disconnect error on read).
remove_hungup_and_drained_tubes<T, U>( events: &SmallVec<[TriggeredEvent<T>; 16]>, wait_ctx: &WaitContext<T>, tubes: &mut BTreeMap<usize, U>, mut tube_ids_to_remove: Vec<usize>, get_tube_id: fn(token: &T) -> Option<usize>, ) -> anyhow::Result<()> where T: EventToken, U: ReadNotifier,4371 fn remove_hungup_and_drained_tubes<T, U>(
4372     events: &SmallVec<[TriggeredEvent<T>; 16]>,
4373     wait_ctx: &WaitContext<T>,
4374     tubes: &mut BTreeMap<usize, U>,
4375     mut tube_ids_to_remove: Vec<usize>,
4376     get_tube_id: fn(token: &T) -> Option<usize>,
4377 ) -> anyhow::Result<()>
4378 where
4379     T: EventToken,
4380     U: ReadNotifier,
4381 {
4382     // It's possible more data is readable and buffered while the socket is hungup,
4383     // so don't delete the tube from the poll context until we're sure all the
4384     // data is read.
4385     // Below case covers a condition where we have received a hungup event and the tube is not
4386     // readable.
4387     // In case of readable tube, once all data is read, any attempt to read more data on hungup
4388     // tube should fail. On such failure, we get Disconnected error and ids gets added to
4389     // tube_ids_to_remove by the time we reach here.
4390     for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
4391         if let Some(id) = get_tube_id(&event.token) {
4392             tube_ids_to_remove.push(id);
4393         }
4394     }
4395 
4396     tube_ids_to_remove.dedup();
4397     for id in tube_ids_to_remove {
4398         // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
4399         // this automatically when the FD inserted into the `wait_ctx` is closed after this
4400         // if-block, but this removal can be deferred unpredictably. In some instances where the
4401         // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
4402         // that has already been closed. Because the token associated with that spurious event
4403         // now belongs to a different socket, the control loop will start to interact with
4404         // sockets that might not be ready to use. This can cause incorrect hangup detection or
4405         // blocking on a socket that will never be ready. See also: crbug.com/1019986
4406         if let Some(socket) = tubes.remove(&id) {
4407             wait_ctx
4408                 .delete(socket.get_read_notifier())
4409                 .context("failed to remove descriptor from wait context")?;
4410         }
4411     }
4412     Ok(())
4413 }
4414 
4415 /// Start and jail a vhost-user device according to its configuration and a vhost listener string.
4416 ///
4417 /// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
4418 /// call outside of `start_devices`!
4419 ///
4420 /// Returns the pid of the jailed device process.
jail_and_start_vu_device<T: VirtioDeviceBuilder>( jail_config: &Option<JailConfig>, params: T, vhost: &str, name: &str, ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)>4421 fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
4422     jail_config: &Option<JailConfig>,
4423     params: T,
4424     vhost: &str,
4425     name: &str,
4426 ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
4427     let mut keep_rds = Vec::new();
4428 
4429     base::syslog::push_descriptors(&mut keep_rds);
4430     cros_tracing::push_descriptors!(&mut keep_rds);
4431     metrics::push_descriptors(&mut keep_rds);
4432 
4433     let jail_type = VirtioDeviceType::VhostUser;
4434 
4435     // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
4436     // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
4437     let jail = params
4438         .create_jail(jail_config, jail_type)
4439         .with_context(|| format!("failed to create jail for {}", name))?
4440         .ok_or(())
4441         .or_else(|_| Minijail::new())
4442         .with_context(|| format!("failed to create empty jail for {}", name))?;
4443 
4444     // Create the device in the parent process, so the child does not need any privileges necessary
4445     // to do it (only runtime capabilities are required).
4446     let device = params
4447         .create_vhost_user_device(&mut keep_rds)
4448         .context("failed to create vhost-user device")?;
4449     let mut listener = VhostUserListener::new(vhost, Some(&mut keep_rds))
4450         .context("failed to create the vhost listener")?;
4451     let parent_resources = listener.take_parent_process_resources();
4452 
4453     // Executor must be created before jail in order to prevent the jailed process from creating
4454     // unrestricted io_urings.
4455     let ex = Executor::new().context("Failed to create an Executor")?;
4456     keep_rds.extend(ex.as_raw_descriptors());
4457 
4458     // Deduplicate the FDs since minijail expects them to be unique.
4459     keep_rds.sort_unstable();
4460     keep_rds.dedup();
4461 
4462     // SAFETY:
4463     // Safe because we are keeping all the descriptors needed for the child to function.
4464     match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
4465         0 => {
4466             // In the child process.
4467 
4468             // Free memory for the resources managed by the parent, without running drop() on them.
4469             // The parent will do it as we exit.
4470             let _ = std::mem::ManuallyDrop::new(parent_resources);
4471 
4472             // Make sure the child process does not survive its parent.
4473             // SAFETY: trivially safe
4474             if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
4475                 panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
4476             }
4477 
4478             // Set the name for the thread.
4479             const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
4480             let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
4481             let thread_name = CString::new(debug_label_trimmed).unwrap();
4482             // SAFETY:
4483             // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
4484             // an error if we don't anyway).
4485             let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };
4486 
4487             // Run the device loop and terminate the child process once it exits.
4488             let res = match listener.run_device(ex, device) {
4489                 Ok(()) => 0,
4490                 Err(e) => {
4491                     error!("error while running device {}: {:#}", name, e);
4492                     1
4493                 }
4494             };
4495             // SAFETY: trivially safe
4496             unsafe { libc::exit(res) };
4497         }
4498         pid => {
4499             // In the parent process. We will drop the device and listener when exiting this method.
4500             // This is fine as ownership for both has been transferred to the child process and they
4501             // will keep living there. We just retain `parent_resources` for things we are supposed
4502             // to clean up ourselves.
4503 
4504             info!("process for device {} (PID {}) started", &name, pid);
4505             #[cfg(feature = "seccomp_trace")]
4506             debug!(
4507                     "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
4508                     pid,
4509                     &name,
4510                     read_jail_addr(&jail)
4511                 );
4512             Ok((pid, parent_resources))
4513         }
4514     }
4515 }
4516 
process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()>4517 fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
4518     let command = tube
4519         .recv::<VmRequest>()
4520         .context("failed to receive VmRequest")?;
4521     let resp = match command {
4522         VmRequest::DiskCommand {
4523             disk_index,
4524             ref command,
4525         } => match &disk_host_tubes.get(disk_index) {
4526             Some(tube) => handle_disk_command(command, tube),
4527             None => VmResponse::Err(base::Error::new(libc::ENODEV)),
4528         },
4529         request => {
4530             error!(
4531                 "Request {:?} currently not supported in vhost user backend",
4532                 request
4533             );
4534             VmResponse::Err(base::Error::new(libc::EPERM))
4535         }
4536     };
4537 
4538     tube.send(&resp).context("failed to send VmResponse")?;
4539     Ok(())
4540 }
4541 
start_vhost_user_control_server( control_server_socket: UnlinkUnixSeqpacketListener, disk_host_tubes: Vec<Tube>, )4542 fn start_vhost_user_control_server(
4543     control_server_socket: UnlinkUnixSeqpacketListener,
4544     disk_host_tubes: Vec<Tube>,
4545 ) {
4546     info!("Start vhost-user control server");
4547     loop {
4548         match control_server_socket.accept() {
4549             Ok(socket) => {
4550                 let tube = match Tube::new_from_unix_seqpacket(socket) {
4551                     Ok(tube) => tube,
4552                     Err(e) => {
4553                         error!("failed to open tube: {:#}", e);
4554                         return;
4555                     }
4556                 };
4557                 if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
4558                     error!("failed to process control request: {:#}", e);
4559                 }
4560             }
4561             Err(e) => {
4562                 error!("failed to establish connection: {}", e);
4563             }
4564         }
4565     }
4566 }
4567 
/// Entry point for the `crosvm devices` command: spawn one jailed process per configured
/// vhost-user device, optionally start a control server, then reap child processes until
/// all of them have exited.
pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
    if let Some(async_executor) = opts.async_executor {
        Executor::set_default_executor_kind(async_executor)
            .context("Failed to set the default async executor")?;
    }

    // Bookkeeping for one spawned device process; stored in `devices_jails` keyed by PID.
    struct DeviceJailInfo {
        // Unique name for the device, in the form `foomatic-0`.
        name: String,
        // Parent-side resources that must outlive the child process; dropped when the
        // child's entry is removed after it exits.
        _drop_resources: Option<Box<dyn std::any::Any>>,
    }

    // Jail and start a single device process, recording it in `devices_jails`.
    // `i` disambiguates multiple devices of the same type in the generated name.
    fn add_device<T: VirtioDeviceBuilder>(
        i: usize,
        device_params: T,
        vhost: &str,
        jail_config: &Option<JailConfig>,
        devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
    ) -> anyhow::Result<()> {
        let name = format!("{}-{}", T::NAME, i);

        let (pid, _drop_resources) =
            jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;

        devices_jails.insert(
            pid,
            DeviceJailInfo {
                name,
                _drop_resources,
            },
        );

        Ok(())
    }

    let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();

    // `None` disables jailing entirely (--disable-sandbox).
    let jail = if opts.disable_sandbox {
        None
    } else {
        Some(opts.jail)
    };

    // Create control server socket
    let control_server_socket = opts.control_socket.map(|path| {
        UnlinkUnixSeqpacketListener(
            UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
        )
    });

    // Create serial devices.
    for (i, params) in opts.serial.iter().enumerate() {
        let serial_config = &params.device;
        add_device(i, serial_config, &params.vhost, &jail, &mut devices_jails)?;
    }

    let mut disk_host_tubes = Vec::new();
    let control_socket_exists = control_server_socket.is_some();
    // Create block devices.
    for (i, params) in opts.block.iter().enumerate() {
        // Only wire up a disk control tube when there is a control server to drive it.
        let tube = if control_socket_exists {
            let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
            disk_host_tubes.push(host_tube);
            Some(device_tube)
        } else {
            None
        };
        let disk_config = DiskConfig::new(&params.device, tube);
        add_device(i, disk_config, &params.vhost, &jail, &mut devices_jails)?;
    }

    // Create vsock devices.
    for (i, params) in opts.vsock.iter().enumerate() {
        add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
    }

    // Create network devices.
    #[cfg(feature = "net")]
    for (i, params) in opts.net.iter().enumerate() {
        add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
    }

    // No device created, that's probably not intended - print the help in that case.
    if devices_jails.is_empty() {
        let err = DevicesCommand::from_args(
            &[&std::env::args().next().unwrap_or(String::from("crosvm"))],
            &["--help"],
        )
        .unwrap_err();
        println!("{}", err.output);
        return Ok(());
    }

    let ex = Executor::new()?;
    if let Some(control_server_socket) = control_server_socket {
        // Start the control server in the parent process.
        ex.spawn_blocking(move || {
            start_vhost_user_control_server(control_server_socket, disk_host_tubes)
        })
        .detach();
    }

    // Now wait for all device processes to return.
    while !devices_jails.is_empty() {
        // pid -1 means "wait for any child process".
        match base::linux::wait_for_pid(-1, 0) {
            Err(e) => panic!("error waiting for child process to complete: {:#}", e),
            Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
                Some((_, info)) => {
                    // Report normal exit vs. death-by-signal distinctly.
                    if let Some(status) = wait_status.code() {
                        info!(
                            "process for device {} (PID {}) exited with code {}",
                            &info.name, pid, status
                        );
                    } else if let Some(signal) = wait_status.signal() {
                        warn!(
                            "process for device {} (PID {}) has been killed by signal {:?}",
                            &info.name, pid, signal,
                        );
                    }
                }
                None => error!("pid {} is not one of our device processes", pid),
            },
            // `wait_for_pid` will necessarily return a PID because we asked to it wait for one to
            // complete.
            Ok((None, _)) => unreachable!(),
        }
    }

    info!("all device processes have exited");

    Ok(())
}
4700 
/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
/// making crash reports incomprehensible.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    // Only the product type is populated; all other attributes use their absent defaults.
    let attributes = crash_report::CrashReportAttributes {
        product_type: "emulator".to_owned(),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attributes)
}
4713 
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    // Create a file-backed mapping parameters struct with the given `address` and `size` and other
    // parameters set to default values.
    fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
        FileBackedMappingParameters {
            address,
            size,
            path: PathBuf::new(),
            offset: 0,
            writable: false,
            sync: false,
            align: false,
        }
    }

    // All cases below use the same two-region guest memory layout: a "low" region
    // [0, 0xD000_0000) and a "high" region [0x1_0000_0000, 0x1_0008_0000), and verify
    // how punch_holes_in_guest_mem_layout_for_mappings carves file-backed mappings
    // out of it.
    #[test]
    fn guest_mem_file_backed_mappings_overlap() {
        // Base case: no file mappings; output layout should be identical.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping that does not overlap guest memory.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000, 0x1000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the start of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0, 0x2000)]
            ),
            vec![
                (
                    GuestAddress(0x2000),
                    0xD000_0000 - 0x2000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the end of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping fully contained within the middle of the low address space region.
        // The low region should be split in two around the hole.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0x1000, Default::default()),
                (
                    GuestAddress(0x3000),
                    0xD000_0000 - 0x3000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the start of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping at the end of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping fully contained within the middle of the high address space region.
        // The high region should be split in two around the hole.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
                (
                    GuestAddress(0x1_0000_3000),
                    0x8_0000 - 0x3000,
                    Default::default()
                ),
            ]
        );

        // File mapping overlapping two guest memory regions: it truncates the end of the
        // low region and trims the start of the high region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
            ),
            vec![
                (GuestAddress(0), 0xA000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );
    }
}
4898