• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #[cfg(target_os = "android")]
6 mod android;
7 pub mod cmdline;
8 pub mod config;
9 mod device_helpers;
10 #[cfg(feature = "gpu")]
11 pub(crate) mod gpu;
12 mod vcpu;
13 
14 use std::cmp::max;
15 use std::cmp::Reverse;
16 use std::collections::BTreeMap;
17 use std::collections::BTreeSet;
18 use std::collections::HashMap;
19 use std::collections::HashSet;
20 use std::convert::TryInto;
21 use std::ffi::CString;
22 use std::fs::File;
23 use std::fs::OpenOptions;
24 use std::hash::Hash;
25 use std::io::prelude::*;
26 use std::io::stdin;
27 use std::iter;
28 use std::mem;
29 use std::ops::RangeInclusive;
30 use std::os::unix::prelude::OpenOptionsExt;
31 use std::os::unix::process::ExitStatusExt;
32 use std::path::Path;
33 use std::process;
34 use std::rc::Rc;
35 use std::sync::mpsc;
36 use std::sync::Arc;
37 use std::sync::Barrier;
38 #[cfg(feature = "balloon")]
39 use std::time::Duration;
40 
41 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
42 use aarch64::AArch64 as Arch;
43 use acpi_tables::sdt::SDT;
44 use anyhow::anyhow;
45 use anyhow::bail;
46 use anyhow::Context;
47 use anyhow::Result;
48 use arch::LinuxArch;
49 use arch::RunnableLinuxVm;
50 use arch::VcpuAffinity;
51 use arch::VirtioDeviceStub;
52 use arch::VmComponents;
53 use arch::VmImage;
54 use base::ReadNotifier;
55 #[cfg(feature = "balloon")]
56 use base::UnixSeqpacket;
57 use base::UnixSeqpacketListener;
58 use base::UnlinkUnixSeqpacketListener;
59 use base::*;
60 use cros_async::Executor;
61 use device_helpers::*;
62 use devices::create_devices_worker_thread;
63 use devices::serial_device::SerialHardware;
64 use devices::vfio::VfioCommonSetup;
65 use devices::vfio::VfioCommonTrait;
66 #[cfg(feature = "gpu")]
67 use devices::virtio;
68 use devices::virtio::device_constants::video::VideoDeviceType;
69 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
70 use devices::virtio::memory_mapper::MemoryMapper;
71 use devices::virtio::memory_mapper::MemoryMapperTrait;
72 use devices::virtio::vhost::user::VhostUserListener;
73 use devices::virtio::vhost::user::VhostUserListenerTrait;
74 #[cfg(feature = "balloon")]
75 use devices::virtio::BalloonFeatures;
76 #[cfg(feature = "balloon")]
77 use devices::virtio::BalloonMode;
78 #[cfg(feature = "gpu")]
79 use devices::virtio::EventDevice;
80 use devices::virtio::VirtioTransportType;
81 #[cfg(feature = "audio")]
82 use devices::Ac97Dev;
83 use devices::Bus;
84 use devices::BusDeviceObj;
85 use devices::CoIommuDev;
86 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
87 #[cfg(feature = "geniezone")]
88 use devices::GeniezoneKernelIrqChip;
89 #[cfg(feature = "usb")]
90 use devices::HostBackendDeviceProvider;
91 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
92 use devices::HostHotPlugKey;
93 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
94 use devices::HotPlugBus;
95 use devices::IommuDevType;
96 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
97 use devices::IrqChipAArch64 as IrqChipArch;
98 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
99 use devices::IrqChipX86_64 as IrqChipArch;
100 use devices::IrqEventIndex;
101 use devices::IrqEventSource;
102 use devices::KvmKernelIrqChip;
103 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
104 use devices::KvmSplitIrqChip;
105 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
106 use devices::PciAddress;
107 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
108 use devices::PciBridge;
109 use devices::PciDevice;
110 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
111 use devices::PciRoot;
112 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
113 use devices::PciRootCommand;
114 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
115 use devices::PcieDownstreamPort;
116 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
117 use devices::PcieHostPort;
118 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
119 use devices::PcieRootPort;
120 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
121 use devices::PcieUpstreamPort;
122 use devices::PvPanicCode;
123 use devices::PvPanicPciDevice;
124 use devices::StubPciDevice;
125 use devices::VirtioMmioDevice;
126 use devices::VirtioPciDevice;
127 #[cfg(feature = "usb")]
128 use devices::XhciController;
129 #[cfg(feature = "gpu")]
130 use gpu::*;
131 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
132 #[cfg(feature = "geniezone")]
133 use hypervisor::geniezone::Geniezone;
134 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
135 #[cfg(feature = "geniezone")]
136 use hypervisor::geniezone::GeniezoneVcpu;
137 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
138 #[cfg(feature = "geniezone")]
139 use hypervisor::geniezone::GeniezoneVm;
140 use hypervisor::kvm::Kvm;
141 use hypervisor::kvm::KvmVcpu;
142 use hypervisor::kvm::KvmVm;
143 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
144 use hypervisor::CpuConfigX86_64;
145 use hypervisor::Hypervisor;
146 use hypervisor::HypervisorCap;
147 use hypervisor::ProtectionType;
148 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
149 use hypervisor::VcpuAArch64 as VcpuArch;
150 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
151 use hypervisor::VcpuX86_64 as VcpuArch;
152 use hypervisor::Vm;
153 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
154 use hypervisor::VmAArch64 as VmArch;
155 use hypervisor::VmCap;
156 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
157 use hypervisor::VmX86_64 as VmArch;
158 use jail::*;
159 use libc;
160 use minijail::Minijail;
161 use resources::AddressRange;
162 use resources::Alloc;
163 #[cfg(feature = "direct")]
164 use resources::Error as ResourceError;
165 use resources::SystemAllocator;
166 use rutabaga_gfx::RutabagaGralloc;
167 use serde::Serialize;
168 use smallvec::SmallVec;
169 #[cfg(feature = "swap")]
170 use swap::SwapController;
171 use sync::Condvar;
172 use sync::Mutex;
173 use vm_control::*;
174 use vm_memory::GuestAddress;
175 use vm_memory::GuestMemory;
176 use vm_memory::MemoryPolicy;
177 use vm_memory::MemoryRegionOptions;
178 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
179 use x86_64::msr::get_override_msr_list;
180 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
181 use x86_64::X8664arch as Arch;
182 
183 use crate::crosvm::config::Config;
184 use crate::crosvm::config::Executable;
185 use crate::crosvm::config::FileBackedMappingParameters;
186 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
187 use crate::crosvm::config::HostPcieRootPortParameters;
188 use crate::crosvm::config::HypervisorKind;
189 use crate::crosvm::config::SharedDir;
190 use crate::crosvm::config::SharedDirKind;
191 #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
192 use crate::crosvm::gdb::gdb_thread;
193 #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
194 use crate::crosvm::gdb::GdbStub;
195 #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), unix))]
196 use crate::crosvm::ratelimit::Ratelimit;
197 use crate::crosvm::sys::cmdline::DevicesCommand;
198 
/// Device node used to open the KVM hypervisor.
const KVM_PATH: &str = "/dev/kvm";
/// Device node used to open the GenieZone hypervisor (arm/aarch64 only,
/// behind the `geniezone` feature).
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
const GENIEZONE_PATH: &str = "/dev/gzvm";
/// Device node used to open the Gunyah hypervisor (arm/aarch64 only,
/// behind the `gunyah` feature).
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
static GUNYAH_PATH: &str = "/dev/gunyah";
205 
/// Builds every virtio device requested by `cfg` and returns them as
/// `VirtioDeviceStub`s (device + optional minijail), in a fixed creation
/// order.
///
/// The `disk_device_tubes`, `pmem_device_tubes`, `fs_device_tubes` and
/// `vvu_proxy_device_tubes` vectors are consumed from the front with
/// `remove(0)`, so callers must pre-populate them with one tube per
/// corresponding entry in `cfg`, in the same order.
///
/// Parameters gated on `#[cfg(feature = ...)]` only exist when that cargo
/// feature is compiled in; `cfg_attr(..., allow(unused_variables))` silences
/// warnings for parameters that are unused in some feature combinations.
///
/// # Errors
/// Fails if any individual device cannot be created, or if a `Tube` pair
/// needed for inter-device plumbing cannot be allocated.
fn create_virtio_devices(
    cfg: &Config,
    vm: &mut impl Vm,
    resources: &mut SystemAllocator,
    #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
    #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
    #[cfg(feature = "balloon")] balloon_wss_device_tube: Option<Tube>,
    #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
    #[cfg(feature = "balloon")] init_balloon_size: u64,
    disk_device_tubes: &mut Vec<Tube>,
    pmem_device_tubes: &mut Vec<Tube>,
    fs_device_tubes: &mut Vec<Tube>,
    #[cfg(feature = "gpu")] gpu_control_tube: Tube,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    vvu_proxy_device_tubes: &mut Vec<Tube>,
    vvu_proxy_max_sibling_mem_size: u64,
    #[cfg_attr(not(feature = "balloon"), allow(unused_variables))] registered_evt_q: &SendTube,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // vhost-user GPU front-ends configured on the command line.
    for opt in &cfg.vhost_user_gpu {
        devs.push(create_vhost_user_gpu_device(cfg.protection_type, opt)?);
    }

    // Virtio-vhost-user proxy devices; each consumes one pre-created tube.
    for opt in &cfg.vvu_proxy {
        devs.push(create_vvu_proxy_device(
            cfg.protection_type,
            &cfg.jail_config,
            opt,
            vvu_proxy_device_tubes.remove(0),
            vvu_proxy_max_sibling_mem_size,
        )?);
    }

    // Tubes handed to the GPU device so wayland/video devices can share
    // resources with it; only built when one of those features is enabled.
    #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
    let mut resource_bridges = Vec::<Tube>::new();

    if !cfg.wayland_socket_paths.is_empty() {
        #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
        let mut wl_resource_bridge = None::<Tube>;

        #[cfg(feature = "gpu")]
        {
            // When a GPU device is configured, give the wayland device one
            // end of a bridge tube and queue the other end for the GPU.
            if cfg.gpu_parameters.is_some() {
                let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
                resource_bridges.push(gpu_socket);
                wl_resource_bridge = Some(wl_socket);
            }
        }

        devs.push(create_wayland_device(
            cfg.protection_type,
            &cfg.jail_config,
            &cfg.wayland_socket_paths,
            wl_resource_bridge,
        )?);
    }

    // One (video tube, backend) pair per configured decoder; the GPU side of
    // each pair is queued as a resource bridge. Devices are registered later,
    // after the GPU device is created.
    #[cfg(feature = "video-decoder")]
    let video_dec_cfg = cfg
        .video_dec
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video decoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    // Same pattern as above, for video encoders.
    #[cfg(feature = "video-encoder")]
    let video_enc_cfg = cfg
        .video_enc
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video encoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    #[cfg(feature = "gpu")]
    {
        if let Some(gpu_parameters) = &cfg.gpu_parameters {
            // Use the first display's parameters (or defaults if none were
            // given) to size the virtual display.
            let display_param = if gpu_parameters.display_params.is_empty() {
                Default::default()
            } else {
                gpu_parameters.display_params[0].clone()
            };
            let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();

            // Event devices forward host window input into the guest.
            let mut event_devices = Vec::new();
            if cfg.display_window_mouse {
                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                // Size the synthetic touch device from the first configured
                // multi-touch spec, falling back to the display dimensions.
                let (multi_touch_width, multi_touch_height) = cfg
                    .virtio_multi_touch
                    .first()
                    .as_ref()
                    .map(|multi_touch_spec| multi_touch_spec.get_size())
                    .unwrap_or((gpu_display_w, gpu_display_h));
                let dev = virtio::new_multi_touch(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    multi_touch_width,
                    multi_touch_height,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up mouse device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::touchscreen(event_device_socket));
            }
            if cfg.display_window_keyboard {
                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let dev = virtio::new_keyboard(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up keyboard device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::keyboard(event_device_socket));
            }

            devs.push(create_gpu_device(
                cfg,
                vm_evt_wrtube,
                gpu_control_tube,
                resource_bridges,
                // Use the unnamed socket for GPU display screens.
                cfg.wayland_socket_paths.get(""),
                cfg.x_display.clone(),
                render_server_fd,
                event_devices,
            )?);
        }
    }

    // Serial parameters that were requested as virtio-console devices.
    for (_, param) in cfg
        .serial_parameters
        .iter()
        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
    {
        let dev = param.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
        devs.push(dev);
    }

    // Block devices; each consumes one disk control tube, in config order.
    for disk in &cfg.disks {
        let disk_config = DiskConfig::new(disk, Some(disk_device_tubes.remove(0)));
        devs.push(
            disk_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    for blk in &cfg.vhost_user_blk {
        devs.push(create_vhost_user_block_device(cfg.protection_type, blk)?);
    }

    for console in &cfg.vhost_user_console {
        devs.push(create_vhost_user_console_device(
            cfg.protection_type,
            console,
        )?);
    }

    // Pmem devices; `index` distinguishes them and each consumes one tube.
    for (index, pmem_disk) in cfg.pmem_devices.iter().enumerate() {
        let pmem_device_tube = pmem_device_tubes.remove(0);
        devs.push(create_pmem_device(
            cfg.protection_type,
            &cfg.jail_config,
            vm,
            resources,
            pmem_disk,
            index,
            pmem_device_tube,
        )?);
    }

    if cfg.rng {
        devs.push(create_rng_device(cfg.protection_type, &cfg.jail_config)?);
    }

    // Software TPM (feature-gated) — only when explicitly enabled in config.
    #[cfg(feature = "tpm")]
    {
        if cfg.software_tpm {
            devs.push(create_software_tpm_device(
                cfg.protection_type,
                &cfg.jail_config,
            )?);
        }
    }

    // vTPM proxy (feature-gated, x86_64 only).
    #[cfg(all(feature = "vtpm", target_arch = "x86_64"))]
    {
        if cfg.vtpm_proxy {
            devs.push(create_vtpm_proxy_device(
                cfg.protection_type,
                &cfg.jail_config,
            )?);
        }
    }

    // Input devices: each kind gets sequential indices starting at 0.
    for (idx, single_touch_spec) in cfg.virtio_single_touch.iter().enumerate() {
        devs.push(create_single_touch_device(
            cfg.protection_type,
            &cfg.jail_config,
            single_touch_spec,
            idx as u32,
        )?);
    }

    for (idx, multi_touch_spec) in cfg.virtio_multi_touch.iter().enumerate() {
        devs.push(create_multi_touch_device(
            cfg.protection_type,
            &cfg.jail_config,
            multi_touch_spec,
            idx as u32,
        )?);
    }

    for (idx, trackpad_spec) in cfg.virtio_trackpad.iter().enumerate() {
        devs.push(create_trackpad_device(
            cfg.protection_type,
            &cfg.jail_config,
            trackpad_spec,
            idx as u32,
        )?);
    }

    for (idx, mouse_socket) in cfg.virtio_mice.iter().enumerate() {
        devs.push(create_mouse_device(
            cfg.protection_type,
            &cfg.jail_config,
            mouse_socket,
            idx as u32,
        )?);
    }

    for (idx, keyboard_socket) in cfg.virtio_keyboard.iter().enumerate() {
        devs.push(create_keyboard_device(
            cfg.protection_type,
            &cfg.jail_config,
            keyboard_socket,
            idx as u32,
        )?);
    }

    for (idx, switches_socket) in cfg.virtio_switches.iter().enumerate() {
        devs.push(create_switches_device(
            cfg.protection_type,
            &cfg.jail_config,
            switches_socket,
            idx as u32,
        )?);
    }

    // Pass-through of host evdev input devices.
    for dev_path in &cfg.virtio_input_evdevs {
        devs.push(create_vinput_device(
            cfg.protection_type,
            &cfg.jail_config,
            dev_path,
        )?);
    }

    #[cfg(feature = "balloon")]
    if let Some(balloon_device_tube) = balloon_device_tube {
        // Assemble the feature bitmask; bit positions come from the
        // BalloonFeatures enum discriminants.
        let balloon_features = (cfg.balloon_page_reporting as u64)
            << BalloonFeatures::PageReporting as u64
            | (cfg.balloon_wss_reporting as u64) << BalloonFeatures::WSSReporting as u64;
        devs.push(create_balloon_device(
            cfg.protection_type,
            &cfg.jail_config,
            if cfg.strict_balloon {
                BalloonMode::Strict
            } else {
                BalloonMode::Relaxed
            },
            balloon_device_tube,
            balloon_wss_device_tube,
            balloon_inflate_tube,
            init_balloon_size,
            balloon_features,
            Some(
                registered_evt_q
                    .try_clone()
                    .context("failed to clone registered_evt_q tube")?,
            ),
        )?);
    }

    // Net devices: a tap is created per entry; multi-queue taps are only
    // used on the non-vhost path.
    for opt in &cfg.net {
        let vq_pairs = opt.vq_pairs.unwrap_or(1);
        let vcpu_count = cfg.vcpu_count.unwrap_or(1);
        let multi_vq = vq_pairs > 1 && !opt.vhost_net;
        let (tap, mac) = create_tap_for_net_device(&opt.mode, multi_vq)?;
        let dev = if opt.vhost_net {
            create_virtio_vhost_net_device_from_tap(
                cfg.protection_type,
                &cfg.jail_config,
                vq_pairs,
                vcpu_count,
                cfg.vhost_net_device_path.clone(),
                tap,
                mac,
            )
        } else {
            create_virtio_net_device_from_tap(
                cfg.protection_type,
                &cfg.jail_config,
                vq_pairs,
                vcpu_count,
                tap,
                mac,
            )
        }?;
        devs.push(dev);
    }

    for net in &cfg.vhost_user_net {
        devs.push(create_vhost_user_net_device(cfg.protection_type, net)?);
    }

    for vsock in &cfg.vhost_user_vsock {
        devs.push(create_vhost_user_vsock_device(cfg.protection_type, vsock)?);
    }

    for opt in &cfg.vhost_user_wl {
        devs.push(create_vhost_user_wl_device(cfg.protection_type, opt)?);
    }

    #[cfg(feature = "audio")]
    {
        for virtio_snd in &cfg.virtio_snds {
            devs.push(create_virtio_snd_device(
                cfg.protection_type,
                &cfg.jail_config,
                virtio_snd.clone(),
            )?);
        }
    }

    // Register the video decoder devices whose tubes were prepared earlier.
    #[cfg(feature = "video-decoder")]
    {
        for (tube, backend) in video_dec_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Decoder,
            )?;
        }
    }
    for socket_path in &cfg.vhost_user_video_dec {
        devs.push(create_vhost_user_video_device(
            cfg.protection_type,
            socket_path,
            VideoDeviceType::Decoder,
        )?);
    }

    // Register the video encoder devices whose tubes were prepared earlier.
    #[cfg(feature = "video-encoder")]
    {
        for (tube, backend) in video_enc_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Encoder,
            )?;
        }
    }

    if let Some(vsock_config) = &cfg.vsock {
        devs.push(
            vsock_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    for vhost_user_fs in &cfg.vhost_user_fs {
        devs.push(create_vhost_user_fs_device(
            cfg.protection_type,
            vhost_user_fs,
        )?);
    }

    for vhost_user_snd in &cfg.vhost_user_snd {
        devs.push(create_vhost_user_snd_device(
            cfg.protection_type,
            vhost_user_snd,
        )?);
    }

    // Shared directories become either virtio-fs or 9p devices depending on
    // the configured kind; FS entries each consume one fs device tube.
    for shared_dir in &cfg.shared_dirs {
        let SharedDir {
            src,
            tag,
            kind,
            uid_map,
            gid_map,
            fs_cfg,
            p9_cfg,
        } = shared_dir;

        let dev = match kind {
            SharedDirKind::FS => {
                let device_tube = fs_device_tubes.remove(0);
                create_fs_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    uid_map,
                    gid_map,
                    src,
                    tag,
                    fs_cfg.clone(),
                    device_tube,
                )?
            }
            SharedDirKind::P9 => create_9p_device(
                cfg.protection_type,
                &cfg.jail_config,
                uid_map,
                gid_map,
                src,
                tag,
                p9_cfg.clone(),
            )?,
        };
        devs.push(dev);
    }

    if let Some(vhost_user_mac80211_hwsim) = &cfg.vhost_user_mac80211_hwsim {
        devs.push(create_vhost_user_mac80211_hwsim_device(
            cfg.protection_type,
            vhost_user_mac80211_hwsim,
        )?);
    }

    #[cfg(feature = "audio")]
    if let Some(path) = &cfg.sound {
        devs.push(create_sound_device(
            path,
            cfg.protection_type,
            &cfg.jail_config,
        )?);
    }

    Ok(devs)
}
672 
create_devices( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, vm_evt_wrtube: &SendTube, iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>, irq_control_tubes: &mut Vec<Tube>, control_tubes: &mut Vec<TaggedControlTube>, #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>, #[cfg(feature = "balloon")] balloon_wss_device_tube: Option<Tube>, #[cfg(feature = "balloon")] init_balloon_size: u64, disk_device_tubes: &mut Vec<Tube>, pmem_device_tubes: &mut Vec<Tube>, fs_device_tubes: &mut Vec<Tube>, #[cfg(feature = "usb")] usb_provider: HostBackendDeviceProvider, #[cfg(feature = "gpu")] gpu_control_tube: Tube, #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>, vvu_proxy_device_tubes: &mut Vec<Tube>, vvu_proxy_max_sibling_mem_size: u64, iova_max_addr: &mut Option<u64>, registered_evt_q: &SendTube, ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>>673 fn create_devices(
674     cfg: &Config,
675     vm: &mut impl Vm,
676     resources: &mut SystemAllocator,
677     vm_evt_wrtube: &SendTube,
678     iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
679     irq_control_tubes: &mut Vec<Tube>,
680     control_tubes: &mut Vec<TaggedControlTube>,
681     #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
682     #[cfg(feature = "balloon")] balloon_wss_device_tube: Option<Tube>,
683     #[cfg(feature = "balloon")] init_balloon_size: u64,
684     disk_device_tubes: &mut Vec<Tube>,
685     pmem_device_tubes: &mut Vec<Tube>,
686     fs_device_tubes: &mut Vec<Tube>,
687     #[cfg(feature = "usb")] usb_provider: HostBackendDeviceProvider,
688     #[cfg(feature = "gpu")] gpu_control_tube: Tube,
689     #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
690     vvu_proxy_device_tubes: &mut Vec<Tube>,
691     vvu_proxy_max_sibling_mem_size: u64,
692     iova_max_addr: &mut Option<u64>,
693     registered_evt_q: &SendTube,
694 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
695     let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
696     #[cfg(feature = "balloon")]
697     let mut balloon_inflate_tube: Option<Tube> = None;
698     if !cfg.vfio.is_empty() {
699         let mut coiommu_attached_endpoints = Vec::new();
700 
701         for vfio_dev in &cfg.vfio {
702             let (dev, jail, viommu_mapper) = create_vfio_device(
703                 &cfg.jail_config,
704                 vm,
705                 resources,
706                 irq_control_tubes,
707                 control_tubes,
708                 &vfio_dev.path,
709                 false,
710                 None,
711                 vfio_dev.guest_address,
712                 Some(&mut coiommu_attached_endpoints),
713                 vfio_dev.iommu,
714                 #[cfg(feature = "direct")]
715                 vfio_dev.intel_lpss,
716             )?;
717             match dev {
718                 VfioDeviceVariant::Pci(vfio_pci_device) => {
719                     *iova_max_addr = Some(max(
720                         vfio_pci_device.get_max_iova(),
721                         iova_max_addr.unwrap_or(0),
722                     ));
723 
724                     if let Some(viommu_mapper) = viommu_mapper {
725                         iommu_attached_endpoints.insert(
726                             vfio_pci_device
727                                 .pci_address()
728                                 .context("not initialized")?
729                                 .to_u32(),
730                             Arc::new(Mutex::new(Box::new(viommu_mapper))),
731                         );
732                     }
733 
734                     devices.push((Box::new(vfio_pci_device), jail));
735                 }
736                 VfioDeviceVariant::Platform(vfio_plat_dev) => {
737                     devices.push((Box::new(vfio_plat_dev), jail));
738                 }
739             }
740         }
741 
742         if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
743             let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
744             let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
745             if res == 0 {
746                 let limit = unsafe { buf.assume_init() };
747                 let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
748                 let rlim_max = max(limit.rlim_max, rlim_new);
749                 if limit.rlim_cur < rlim_new {
750                     let limit_arg = libc::rlimit64 {
751                         rlim_cur: rlim_new,
752                         rlim_max,
753                     };
754                     let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
755                     if res != 0 {
756                         bail!("Set rlimit failed");
757                     }
758                 }
759             } else {
760                 bail!("Get rlimit failed");
761             }
762         }
763         #[cfg(feature = "balloon")]
764         let coiommu_tube: Option<Tube>;
765         #[cfg(not(feature = "balloon"))]
766         let coiommu_tube: Option<Tube> = None;
767         if !coiommu_attached_endpoints.is_empty() {
768             let vfio_container =
769                 VfioCommonSetup::vfio_get_container(IommuDevType::CoIommu, None as Option<&Path>)
770                     .context("failed to get vfio container")?;
771             let (coiommu_host_tube, coiommu_device_tube) =
772                 Tube::pair().context("failed to create coiommu tube")?;
773             control_tubes.push(TaggedControlTube::VmMemory {
774                 tube: coiommu_host_tube,
775                 expose_with_viommu: false,
776             });
777             let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
778             #[cfg(feature = "balloon")]
779             match Tube::pair() {
780                 Ok((x, y)) => {
781                     coiommu_tube = Some(x);
782                     balloon_inflate_tube = Some(y);
783                 }
784                 Err(x) => return Err(x).context("failed to create coiommu tube"),
785             }
786             let dev = CoIommuDev::new(
787                 vm.get_memory().clone(),
788                 vfio_container,
789                 coiommu_device_tube,
790                 coiommu_tube,
791                 coiommu_attached_endpoints,
792                 vcpu_count,
793                 cfg.coiommu_param.unwrap_or_default(),
794             )
795             .context("failed to create coiommu device")?;
796 
797             devices.push((
798                 Box::new(dev),
799                 simple_jail(&cfg.jail_config, "coiommu_device")?,
800             ));
801         }
802     }
803 
804     let stubs = create_virtio_devices(
805         cfg,
806         vm,
807         resources,
808         vm_evt_wrtube,
809         #[cfg(feature = "balloon")]
810         balloon_device_tube,
811         #[cfg(feature = "balloon")]
812         balloon_wss_device_tube,
813         #[cfg(feature = "balloon")]
814         balloon_inflate_tube,
815         #[cfg(feature = "balloon")]
816         init_balloon_size,
817         disk_device_tubes,
818         pmem_device_tubes,
819         fs_device_tubes,
820         #[cfg(feature = "gpu")]
821         gpu_control_tube,
822         #[cfg(feature = "gpu")]
823         render_server_fd,
824         vvu_proxy_device_tubes,
825         vvu_proxy_max_sibling_mem_size,
826         registered_evt_q,
827     )?;
828 
829     for stub in stubs {
830         match stub.dev.transport_type() {
831             VirtioTransportType::Pci => {
832                 let (msi_host_tube, msi_device_tube) =
833                     Tube::pair().context("failed to create tube")?;
834                 irq_control_tubes.push(msi_host_tube);
835 
836                 let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
837                     let (host_tube, device_tube) =
838                         Tube::pair().context("failed to create VVU proxy tube")?;
839                     control_tubes.push(TaggedControlTube::VmMemory {
840                         tube: host_tube,
841                         expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
842                     });
843                     Some(device_tube)
844                 } else {
845                     None
846                 };
847 
848                 let (ioevent_host_tube, ioevent_device_tube) =
849                     Tube::pair().context("failed to create ioevent tube")?;
850                 control_tubes.push(TaggedControlTube::VmMemory {
851                     tube: ioevent_host_tube,
852                     expose_with_viommu: false,
853                 });
854 
855                 let dev = VirtioPciDevice::new(
856                     vm.get_memory().clone(),
857                     stub.dev,
858                     msi_device_tube,
859                     cfg.disable_virtio_intx,
860                     shared_memory_tube,
861                     ioevent_device_tube,
862                 )
863                 .context("failed to create virtio pci dev")?;
864 
865                 devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
866             }
867             VirtioTransportType::Mmio => {
868                 let dev = VirtioMmioDevice::new(vm.get_memory().clone(), stub.dev, false)
869                     .context("failed to create virtio mmio dev")?;
870                 devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
871             }
872         }
873     }
874 
875     #[cfg(feature = "audio")]
876     for ac97_param in &cfg.ac97_parameters {
877         let dev = Ac97Dev::try_new(vm.get_memory().clone(), ac97_param.clone())
878             .context("failed to create ac97 device")?;
879         let jail = simple_jail(&cfg.jail_config, dev.minijail_policy())?;
880         devices.push((Box::new(dev), jail));
881     }
882 
883     #[cfg(feature = "usb")]
884     if cfg.usb {
885         // Create xhci controller.
886         let usb_controller = Box::new(XhciController::new(vm.get_memory().clone(), usb_provider));
887         devices.push((
888             usb_controller,
889             simple_jail(&cfg.jail_config, "xhci_device")?,
890         ));
891     }
892 
893     for params in &cfg.stub_pci_devices {
894         // Stub devices don't need jailing since they don't do anything.
895         devices.push((Box::new(StubPciDevice::new(params)), None));
896     }
897 
898     devices.push((
899         Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
900         None,
901     ));
902 
903     Ok(devices)
904 }
905 
create_file_backed_mappings( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, ) -> Result<()>906 fn create_file_backed_mappings(
907     cfg: &Config,
908     vm: &mut impl Vm,
909     resources: &mut SystemAllocator,
910 ) -> Result<()> {
911     for mapping in &cfg.file_backed_mappings {
912         let file = OpenOptions::new()
913             .read(true)
914             .write(mapping.writable)
915             .custom_flags(if mapping.sync { libc::O_SYNC } else { 0 })
916             .open(&mapping.path)
917             .context("failed to open file for file-backed mapping")?;
918         let prot = if mapping.writable {
919             Protection::read_write()
920         } else {
921             Protection::read()
922         };
923         let size = mapping
924             .size
925             .try_into()
926             .context("Invalid size for file-backed mapping")?;
927         let memory_mapping = MemoryMappingBuilder::new(size)
928             .from_file(&file)
929             .offset(mapping.offset)
930             .protection(prot)
931             .build()
932             .context("failed to map backing file for file-backed mapping")?;
933 
934         let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
935             .context("failed to convert to AddressRange")?;
936         match resources.mmio_allocator_any().allocate_at(
937             mapping_range,
938             Alloc::FileBacked(mapping.address),
939             "file-backed mapping".to_owned(),
940         ) {
941             // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
942             // consider it an error.
943             // TODO(b/222769529): Reserve this region in a global memory address space allocator once
944             // we have that so nothing else can accidentally overlap with it.
945             Ok(()) | Err(resources::Error::OutOfSpace) => {}
946             e => e.context("failed to allocate guest address for file-backed mapping")?,
947         }
948 
949         vm.add_memory_region(
950             GuestAddress(mapping.address),
951             Box::new(memory_mapping),
952             !mapping.writable,
953             /* log_dirty_pages = */ false,
954         )
955         .context("failed to configure file-backed mapping")?;
956     }
957 
958     Ok(())
959 }
960 
/// Creates virtual PCIe root ports and queues them for registration.
///
/// With an empty `host_pcie_rp`, a plain root port is created for every
/// occupied non-root PCI bus, plus one hot-plug-capable root port on the
/// first empty bus found. Otherwise, one root port is created per supplied
/// host root port, mirroring that host port's bus range and hotplug ability.
///
/// Results are accumulated into the `&mut Vec` output parameters: bridges go
/// into `devices`, their MSI host tubes into `irq_control_tubes`, host-port
/// VM tubes into `control_tubes`, hot-pluggable ports into `hp_vec`, the PCI
/// address ranges behind hot-pluggable ports into `hp_endpoints_ranges`
/// (consumed by virtio-iommu, per the comment below), and GPE/PME
/// notification targets into `gpe_notify_devs` / `pme_notify_devs`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn create_pcie_root_port(
    host_pcie_rp: Vec<HostPcieRootPortParameters>,
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    control_tubes: &mut Vec<TaggedControlTube>,
    devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
    hp_vec: &mut Vec<(u8, Arc<Mutex<dyn HotPlugBus>>)>,
    hp_endpoints_ranges: &mut Vec<RangeInclusive<u32>>,
    // TODO(b/228627457): clippy is incorrectly warning about this Vec, which needs to be a Vec so
    // we can push into it
    #[allow(clippy::ptr_arg)] gpe_notify_devs: &mut Vec<(u32, Arc<Mutex<dyn GpeNotify>>)>,
    #[allow(clippy::ptr_arg)] pme_notify_devs: &mut Vec<(u8, Arc<Mutex<dyn PmeNotify>>)>,
) -> Result<()> {
    if host_pcie_rp.is_empty() {
        // The user didn't specify host pcie root ports to link to virtual pcie
        // root ports: find an empty bus and create fully virtual pcie rps.
        let mut hp_sec_bus = 0u8;
        // Create a Pcie Root Port for each occupied non-root bus; each non-root
        // bus device will be connected behind a virtual pcie root port.
        // (Bus 0 is skipped; the first *empty* bus is remembered for hotplug.)
        for i in 1..255 {
            if sys_allocator.pci_bus_empty(i) {
                if hp_sec_bus == 0 {
                    hp_sec_bus = i;
                }
                continue;
            }
            let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
            pme_notify_devs.push((i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>));
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            irq_control_tubes.push(msi_host_tube);
            let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
            // no ipc is used if the root port disables hotplug
            devices.push((pci_bridge, None));
        }

        // Create a hot-plug-capable Pcie Root Port on the first empty bus.
        if hp_sec_bus == 0 {
            return Err(anyhow!("no more addresses are available"));
        }
        let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
        pme_notify_devs.push((
            hp_sec_bus,
            pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
        ));
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        irq_control_tubes.push(msi_host_tube);
        let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));

        // Record the full (bus, dev, fn) range behind this bridge so endpoints
        // hot-plugged there can be identified later.
        hp_endpoints_ranges.push(RangeInclusive::new(
            PciAddress {
                bus: pci_bridge.get_secondary_num(),
                dev: 0,
                func: 0,
            }
            .to_u32(),
            PciAddress {
                bus: pci_bridge.get_subordinate_num(),
                dev: 32,
                func: 8,
            }
            .to_u32(),
        ));

        devices.push((pci_bridge, None));
        hp_vec.push((hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>));
    } else {
        // The user specified host pcie root ports to link to virtual pcie rps:
        // reserve the host pci BDFs and create virtual pcie RPs with some
        // attributes matching the host's.
        for host_pcie in host_pcie_rp.iter() {
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            let pcie_host = PcieHostPort::new(host_pcie.host_path.as_path(), vm_device_tube)?;
            let bus_range = pcie_host.get_bus_range();
            let mut slot_implemented = true;
            for i in bus_range.secondary..=bus_range.subordinate {
                // if this bus is occupied by one vfio-pci device, this vfio-pci device is
                // connected to a pci bridge on host statically, then it should be connected
                // to a virtual pci bridge in guest statically, this bridge won't have
                // hotplug capability and won't use slot.
                if !sys_allocator.pci_bus_empty(i) {
                    slot_implemented = false;
                    break;
                }
            }

            let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new_from_host(
                pcie_host,
                slot_implemented,
            )?));
            control_tubes.push(TaggedControlTube::Vm(vm_host_tube));

            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            irq_control_tubes.push(msi_host_tube);
            let mut pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
            // early reservation for host pcie root port devices.
            // A failed reservation is only warned about, not fatal.
            let rootport_addr = pci_bridge.allocate_address(sys_allocator);
            if rootport_addr.is_err() {
                warn!(
                    "address reservation failed for hot pcie root port {}",
                    pci_bridge.debug_label()
                );
            }

            // Only append the sub pci range of a hot-pluggable root port to virtio-iommu
            if slot_implemented {
                hp_endpoints_ranges.push(RangeInclusive::new(
                    PciAddress {
                        bus: pci_bridge.get_secondary_num(),
                        dev: 0,
                        func: 0,
                    }
                    .to_u32(),
                    PciAddress {
                        bus: pci_bridge.get_subordinate_num(),
                        dev: 32,
                        func: 8,
                    }
                    .to_u32(),
                ));
            }

            devices.push((pci_bridge, None));
            if slot_implemented {
                // Hot-pluggable ports may also need ACPI GPE notifications.
                if let Some(gpe) = host_pcie.hp_gpe {
                    gpe_notify_devs
                        .push((gpe, pcie_root_port.clone() as Arc<Mutex<dyn GpeNotify>>));
                }
                hp_vec.push((
                    bus_range.secondary,
                    pcie_root_port as Arc<Mutex<dyn HotPlugBus>>,
                ));
            }
        }
    }

    Ok(())
}
1098 
setup_vm_components(cfg: &Config) -> Result<VmComponents>1099 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
1100     let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
1101         Some(
1102             open_file(initrd_path, OpenOptions::new().read(true))
1103                 .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
1104         )
1105     } else {
1106         None
1107     };
1108     let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
1109         Some(
1110             open_file(pvm_fw_path, OpenOptions::new().read(true))
1111                 .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
1112         )
1113     } else {
1114         None
1115     };
1116 
1117     let vm_image = match cfg.executable_path {
1118         Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
1119             open_file(kernel_path, OpenOptions::new().read(true)).with_context(|| {
1120                 format!("failed to open kernel image {}", kernel_path.display())
1121             })?,
1122         ),
1123         Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
1124             open_file(bios_path, OpenOptions::new().read(true))
1125                 .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
1126         ),
1127         _ => panic!("Did not receive a bios or kernel, should be impossible."),
1128     };
1129 
1130     let swiotlb = if let Some(size) = cfg.swiotlb {
1131         Some(
1132             size.checked_mul(1024 * 1024)
1133                 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
1134         )
1135     } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
1136         None
1137     } else {
1138         Some(64 * 1024 * 1024)
1139     };
1140 
1141     let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
1142     {
1143         (
1144             Some(
1145                 open_file(
1146                     &pflash_parameters.path,
1147                     OpenOptions::new().read(true).write(true),
1148                 )
1149                 .with_context(|| {
1150                     format!("failed to open pflash {}", pflash_parameters.path.display())
1151                 })?,
1152             ),
1153             pflash_parameters.block_size,
1154         )
1155     } else {
1156         (None, 0)
1157     };
1158 
1159     Ok(VmComponents {
1160         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1161         ac_adapter: cfg.ac_adapter,
1162         memory_size: cfg
1163             .memory
1164             .unwrap_or(256)
1165             .checked_mul(1024 * 1024)
1166             .ok_or_else(|| anyhow!("requested memory size too large"))?,
1167         swiotlb,
1168         vcpu_count: cfg.vcpu_count.unwrap_or(1),
1169         vcpu_affinity: cfg.vcpu_affinity.clone(),
1170         cpu_clusters: cfg.cpu_clusters.clone(),
1171         cpu_capacity: cfg.cpu_capacity.clone(),
1172         #[cfg(feature = "direct")]
1173         direct_gpe: cfg.direct_gpe.clone(),
1174         #[cfg(feature = "direct")]
1175         direct_fixed_evts: cfg.direct_fixed_evts.clone(),
1176         no_smt: cfg.no_smt,
1177         hugepages: cfg.hugepages,
1178         hv_cfg: hypervisor::Config {
1179             #[cfg(target_arch = "aarch64")]
1180             mte: cfg.mte,
1181             protection_type: cfg.protection_type,
1182         },
1183         vm_image,
1184         android_fstab: cfg
1185             .android_fstab
1186             .as_ref()
1187             .map(|x| {
1188                 File::open(x)
1189                     .with_context(|| format!("failed to open android fstab file {}", x.display()))
1190             })
1191             .map_or(Ok(None), |v| v.map(Some))?,
1192         pstore: cfg.pstore.clone(),
1193         pflash_block_size,
1194         pflash_image,
1195         initrd_image,
1196         extra_kernel_params: cfg.params.clone(),
1197         acpi_sdts: cfg
1198             .acpi_tables
1199             .iter()
1200             .map(|path| {
1201                 SDT::from_file(path)
1202                     .with_context(|| format!("failed to open ACPI file {}", path.display()))
1203             })
1204             .collect::<Result<Vec<SDT>>>()?,
1205         rt_cpus: cfg.rt_cpus.clone(),
1206         delay_rt: cfg.delay_rt,
1207         #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
1208         gdb: None,
1209         dmi_path: cfg.dmi_path.clone(),
1210         no_i8042: cfg.no_i8042,
1211         no_rtc: cfg.no_rtc,
1212         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1213         oem_strings: cfg.oem_strings.clone(),
1214         host_cpu_topology: cfg.host_cpu_topology,
1215         itmt: cfg.itmt,
1216         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1217         force_s2idle: cfg.force_s2idle,
1218         pvm_fw: pvm_fw_image,
1219         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1220         pcie_ecam: cfg.pcie_ecam,
1221         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1222         pci_low_start: cfg.pci_low_start,
1223     })
1224 }
1225 
/// How a VM's execution concluded. Returned by the `run_*` entry points so
/// the caller can decide on an exit code or restart behavior.
///
/// NOTE(review): variant semantics are inferred from their names — confirm
/// against the sites that produce and consume this enum.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
    Reset,
    Stop,
    Crash,
    GuestPanic,
    WatchdogReset,
}
1234 // Remove ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
1235 // Returns the updated guest memory layout.
punch_holes_in_guest_mem_layout_for_mappings( guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>, file_backed_mappings: &[FileBackedMappingParameters], ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>1236 fn punch_holes_in_guest_mem_layout_for_mappings(
1237     guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
1238     file_backed_mappings: &[FileBackedMappingParameters],
1239 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
1240     // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
1241     // at end is not included in the range).
1242     let mut layout_set = BTreeSet::new();
1243     for (addr, size, options) in &guest_mem_layout {
1244         layout_set.insert((addr.offset(), addr.offset() + size, *options));
1245     }
1246 
1247     for mapping in file_backed_mappings {
1248         let mapping_start = mapping.address;
1249         let mapping_end = mapping_start + mapping.size;
1250 
1251         // Repeatedly split overlapping guest memory regions until no overlaps remain.
1252         while let Some((range_start, range_end, options)) = layout_set
1253             .iter()
1254             .find(|&&(range_start, range_end, _)| {
1255                 mapping_start < range_end && mapping_end > range_start
1256             })
1257             .cloned()
1258         {
1259             layout_set.remove(&(range_start, range_end, options));
1260 
1261             if range_start < mapping_start {
1262                 layout_set.insert((range_start, mapping_start, options));
1263             }
1264             if range_end > mapping_end {
1265                 layout_set.insert((mapping_end, range_end, options));
1266             }
1267         }
1268     }
1269 
1270     // Build the final guest memory layout from the modified layout_set.
1271     layout_set
1272         .iter()
1273         .map(|(start, end, options)| (GuestAddress(*start), end - start, *options))
1274         .collect()
1275 }
1276 
create_guest_memory( cfg: &Config, components: &VmComponents, hypervisor: &impl Hypervisor, ) -> Result<GuestMemory>1277 fn create_guest_memory(
1278     cfg: &Config,
1279     components: &VmComponents,
1280     hypervisor: &impl Hypervisor,
1281 ) -> Result<GuestMemory> {
1282     let guest_mem_layout = Arch::guest_memory_layout(components, hypervisor)
1283         .context("failed to create guest memory layout")?;
1284 
1285     let guest_mem_layout =
1286         punch_holes_in_guest_mem_layout_for_mappings(guest_mem_layout, &cfg.file_backed_mappings);
1287 
1288     let guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1289         .context("failed to create guest memory")?;
1290     let mut mem_policy = MemoryPolicy::empty();
1291     if components.hugepages {
1292         mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1293     }
1294 
1295     if cfg.lock_guest_memory {
1296         mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1297     }
1298     guest_mem.set_memory_policy(mem_policy);
1299 
1300     if cfg.unmap_guest_memory_on_fork {
1301         // Note that this isn't compatible with sandboxing. We could potentially fix that by
1302         // delaying the call until after the sandboxed devices are forked. However, the main use
1303         // for this is in conjunction with protected VMs, where most of the guest memory has been
1304         // unshared with the host. We'd need to be confident that the guest memory is unshared with
1305         // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1306         // So, for now we keep things simple to be safe.
1307         guest_mem.use_dontfork().context("use_dontfork failed")?;
1308     }
1309 
1310     Ok(guest_mem)
1311 }
1312 
/// Opens the GenieZone hypervisor device and runs the configured VM on it.
#[cfg(any(target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    let device_path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
    let gz = Geniezone::new_with_path(device_path)
        .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &gz)?;

    // Launch the vmm-swap monitor process before creating the VM, when a swap
    // directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm =
        GeniezoneVm::new(&gz, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    if cfg.split_irqchip {
        unimplemented!("Geniezone does not support split irqchip mode");
    }
    // Only the in-kernel irqchip is supported, so there is no IOAPIC tube.
    let ioapic_host_tube: Option<Tube> = None;
    let mut irq_chip = GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
        .context("failed to create IRQ chip")?;

    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1361 
/// Opens the KVM device and runs the configured VM on it.
///
/// Sets up guest memory, the optional vmm-swap monitor, KVM-specific VM
/// options (ITMT, userspace MSR handling), and the irqchip (split or
/// in-kernel), then hands control to `run_vm`.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &kvm)?;

    // Launch the vmm-swap monitor process before creating the VM, when a swap
    // directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    if !cfg.userspace_msr.is_empty() {
        vm.enable_userspace_msr()
            .context("failed to enable userspace MSR handling, do you have kernel 5.10 or later")?;
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            let msr_list = get_override_msr_list(&cfg.userspace_msr);
            vm.set_msr_filter(msr_list)
                .context("failed to set msr filter")?;
        }
    }

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Local wrapper so both irqchip flavors can be passed to `run_vm` through
    // a single `&mut dyn IrqChipArch`.
    enum KvmIrqChip {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
                KvmIrqChip::Split(i) => i,
                KvmIrqChip::Kernel(i) => i,
            }
        }
    }

    // `ioapic_host_tube` is only populated in split-irqchip mode, where the
    // IOAPIC is emulated in userspace.
    let ioapic_host_tube;
    let mut irq_chip = if cfg.split_irqchip {
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        unimplemented!("KVM split irqchip mode only supported on x86 processors");
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            let (host_tube, ioapic_device_tube) = Tube::pair().context("failed to create tube")?;
            ioapic_host_tube = Some(host_tube);
            KvmIrqChip::Split(
                KvmSplitIrqChip::new(
                    vm_clone,
                    components.vcpu_count,
                    ioapic_device_tube,
                    // NOTE(review): presumably the IOAPIC pin/GSI count —
                    // confirm against KvmSplitIrqChip::new.
                    Some(120),
                )
                .context("failed to create IRQ chip")?,
            )
        }
    } else {
        ioapic_host_tube = None;
        KvmIrqChip::Kernel(
            KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
                .context("failed to create IRQ chip")?,
        )
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1456 
/// Opens the Gunyah hypervisor device and runs the configured VM on it.
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
fn run_gunyah(
    device_path: Option<&Path>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::{Gunyah, GunyahVcpu, GunyahVm};

    let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(device_path)
        .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &gunyah)?;

    // Launch the vmm-swap monitor process before creating the VM, when a swap
    // directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm = GunyahVm::new(&gunyah, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    let vm_clone = vm.try_clone()?;

    // No IOAPIC tube: Gunyah uses its own irqchip here.
    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        vm,
        &mut GunyahIrqChip::new(vm_clone)?,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1501 
1502 /// Choose a default hypervisor if no `--hypervisor` option was specified.
get_default_hypervisor() -> Option<HypervisorKind>1503 fn get_default_hypervisor() -> Option<HypervisorKind> {
1504     let kvm_path = Path::new(KVM_PATH);
1505     if kvm_path.exists() {
1506         return Some(HypervisorKind::Kvm {
1507             device: Some(kvm_path.to_path_buf()),
1508         });
1509     }
1510 
1511     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1512     #[cfg(feature = "geniezone")]
1513     {
1514         let gz_path = Path::new(GENIEZONE_PATH);
1515         if gz_path.exists() {
1516             return Some(HypervisorKind::Geniezone {
1517                 device: Some(gz_path.to_path_buf()),
1518             });
1519         }
1520     }
1521 
1522     #[cfg(all(
1523         unix,
1524         any(target_arch = "arm", target_arch = "aarch64"),
1525         feature = "gunyah"
1526     ))]
1527     {
1528         let gunyah_path = Path::new(GUNYAH_PATH);
1529         if gunyah_path.exists() {
1530             return Some(HypervisorKind::Gunyah {
1531                 device: Some(gunyah_path.to_path_buf()),
1532             });
1533         }
1534     }
1535 
1536     None
1537 }
1538 
/// Runs a VM built from `cfg`, dispatching to the configured (or default)
/// hypervisor backend. Returns how the VM eventually exited.
pub fn run_config(cfg: Config) -> Result<ExitState> {
    // Set the default async executor kind first, before anything below
    // constructs an Executor.
    if let Some(async_executor) = cfg.async_executor {
        Executor::set_default_executor_kind(async_executor)
            .context("Failed to set the default async executor")?;
    }

    let components = setup_vm_components(&cfg)?;

    // An explicitly configured hypervisor wins; otherwise probe for an
    // available hypervisor device node.
    let hypervisor = cfg
        .hypervisor
        .clone()
        .or_else(get_default_hypervisor)
        .context("no enabled hypervisor")?;

    debug!("creating hypervisor: {:?}", hypervisor);

    match hypervisor {
        HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        #[cfg(feature = "geniezone")]
        HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
        #[cfg(all(
            unix,
            any(target_arch = "arm", target_arch = "aarch64"),
            feature = "gunyah"
        ))]
        HypervisorKind::Gunyah { device } => run_gunyah(device.as_deref(), cfg, components),
    }
}
1568 
run_vm<Vcpu, V>( cfg: Config, #[allow(unused_mut)] mut components: VmComponents, mut vm: V, irq_chip: &mut dyn IrqChipArch, ioapic_host_tube: Option<Tube>, #[cfg(feature = "swap")] swap_controller: Option<SwapController>, ) -> Result<ExitState> where Vcpu: VcpuArch + 'static, V: VmArch + 'static,1569 fn run_vm<Vcpu, V>(
1570     cfg: Config,
1571     #[allow(unused_mut)] mut components: VmComponents,
1572     mut vm: V,
1573     irq_chip: &mut dyn IrqChipArch,
1574     ioapic_host_tube: Option<Tube>,
1575     #[cfg(feature = "swap")] swap_controller: Option<SwapController>,
1576 ) -> Result<ExitState>
1577 where
1578     Vcpu: VcpuArch + 'static,
1579     V: VmArch + 'static,
1580 {
1581     if cfg.jail_config.is_some() {
1582         // Printing something to the syslog before entering minijail so that libc's syslogger has a
1583         // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
1584         // access to those files will not be possible.
1585         info!("crosvm entering multiprocess mode");
1586     }
1587 
1588     #[cfg(feature = "gpu")]
1589     let (gpu_control_host_tube, gpu_control_device_tube) =
1590         Tube::pair().context("failed to create gpu tube")?;
1591 
1592     #[cfg(feature = "usb")]
1593     let (usb_control_tube, usb_provider) =
1594         HostBackendDeviceProvider::new().context("failed to create usb provider")?;
1595 
1596     // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
1597     // before any jailed devices have been spawned, so that we can catch any of them that fail very
1598     // quickly.
1599     let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
1600 
1601     let control_server_socket = match &cfg.socket_path {
1602         Some(path) => Some(UnlinkUnixSeqpacketListener(
1603             UnixSeqpacketListener::bind(path).context("failed to create control server")?,
1604         )),
1605         None => None,
1606     };
1607 
1608     let mut control_tubes = Vec::new();
1609     let mut irq_control_tubes = Vec::new();
1610 
1611     #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
1612     if let Some(port) = cfg.gdb {
1613         // GDB needs a control socket to interrupt vcpus.
1614         let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
1615         control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
1616         components.gdb = Some((port, gdb_control_tube));
1617     }
1618 
1619     #[cfg(feature = "balloon")]
1620     let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
1621         if let Some(ref path) = cfg.balloon_control {
1622             (
1623                 None,
1624                 Some(Tube::new_from_unix_seqpacket(
1625                     UnixSeqpacket::connect(path).with_context(|| {
1626                         format!(
1627                             "failed to connect to balloon control socket {}",
1628                             path.display(),
1629                         )
1630                     })?,
1631                 )),
1632             )
1633         } else {
1634             // Balloon gets a special socket so balloon requests can be forwarded
1635             // from the main process.
1636             let (host, device) = Tube::pair().context("failed to create tube")?;
1637             // Set recv timeout to avoid deadlock on sending BalloonControlCommand
1638             // before the guest is ready.
1639             host.set_recv_timeout(Some(Duration::from_millis(100)))
1640                 .context("failed to set timeout")?;
1641             (Some(host), Some(device))
1642         }
1643     } else {
1644         (None, None)
1645     };
1646 
1647     #[cfg(feature = "balloon")]
1648     let (balloon_wss_host_tube, balloon_wss_device_tube) = if cfg.balloon_wss_reporting {
1649         let (host, device) = Tube::pair().context("failed to create tube")?;
1650         host.set_recv_timeout(Some(Duration::from_millis(100)))
1651             .context("failed to set timeout")?;
1652         (Some(host), Some(device))
1653     } else {
1654         (None, None)
1655     };
1656 
1657     // Create one control socket per disk.
1658     let mut disk_device_tubes = Vec::new();
1659     let mut disk_host_tubes = Vec::new();
1660     let disk_count = cfg.disks.len();
1661     for _ in 0..disk_count {
1662         let (disk_host_tub, disk_device_tube) = Tube::pair().context("failed to create tube")?;
1663         disk_host_tubes.push(disk_host_tub);
1664         disk_device_tubes.push(disk_device_tube);
1665     }
1666 
1667     let mut pmem_device_tubes = Vec::new();
1668     let pmem_count = cfg.pmem_devices.len();
1669     for _ in 0..pmem_count {
1670         let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
1671         pmem_device_tubes.push(pmem_device_tube);
1672         control_tubes.push(TaggedControlTube::VmMsync(pmem_host_tube));
1673     }
1674 
1675     if let Some(ioapic_host_tube) = ioapic_host_tube {
1676         irq_control_tubes.push(ioapic_host_tube);
1677     }
1678 
1679     let battery = if cfg.battery_config.is_some() {
1680         #[cfg_attr(
1681             not(feature = "power-monitor-powerd"),
1682             allow(clippy::manual_map, clippy::needless_match, unused_mut)
1683         )]
1684         let jail = if let Some(jail_config) = &cfg.jail_config {
1685             let mut config = SandboxConfig::new(jail_config, "battery");
1686             #[cfg(feature = "power-monitor-powerd")]
1687             {
1688                 config.bind_mounts = true;
1689             }
1690             let mut jail =
1691                 create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
1692 
1693             // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
1694             #[cfg(feature = "power-monitor-powerd")]
1695             {
1696                 let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
1697                 jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
1698             }
1699             Some(jail)
1700         } else {
1701             None
1702         };
1703         (cfg.battery_config.as_ref().map(|c| c.type_), jail)
1704     } else {
1705         (cfg.battery_config.as_ref().map(|c| c.type_), None)
1706     };
1707 
1708     let fs_count = cfg
1709         .shared_dirs
1710         .iter()
1711         .filter(|sd| sd.kind == SharedDirKind::FS)
1712         .count();
1713     let mut fs_device_tubes = Vec::with_capacity(fs_count);
1714     for _ in 0..fs_count {
1715         let (fs_host_tube, fs_device_tube) = Tube::pair().context("failed to create tube")?;
1716         control_tubes.push(TaggedControlTube::Fs(fs_host_tube));
1717         fs_device_tubes.push(fs_device_tube);
1718     }
1719 
1720     let mut vvu_proxy_device_tubes = Vec::new();
1721     for _ in 0..cfg.vvu_proxy.len() {
1722         let (vvu_proxy_host_tube, vvu_proxy_device_tube) =
1723             Tube::pair().context("failed to create VVU proxy tube")?;
1724         control_tubes.push(TaggedControlTube::VmMemory {
1725             tube: vvu_proxy_host_tube,
1726             expose_with_viommu: false,
1727         });
1728         vvu_proxy_device_tubes.push(vvu_proxy_device_tube);
1729     }
1730 
1731     let (vm_evt_wrtube, vm_evt_rdtube) =
1732         Tube::directional_pair().context("failed to create vm event tube")?;
1733 
1734     let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
1735     let mut sys_allocator = SystemAllocator::new(
1736         Arch::get_system_allocator_config(&vm),
1737         pstore_size,
1738         &cfg.mmio_address_ranges,
1739     )
1740     .context("failed to create system allocator")?;
1741 
1742     let ramoops_region = match &components.pstore {
1743         Some(pstore) => Some(
1744             arch::pstore::create_memory_region(
1745                 &mut vm,
1746                 sys_allocator.reserved_region().unwrap(),
1747                 pstore,
1748             )
1749             .context("failed to allocate pstore region")?,
1750         ),
1751         None => None,
1752     };
1753 
1754     create_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
1755 
1756     #[cfg(feature = "gpu")]
1757     // Hold on to the render server jail so it keeps running until we exit run_vm()
1758     let (_render_server_jail, render_server_fd) =
1759         if let Some(parameters) = &cfg.gpu_render_server_parameters {
1760             let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
1761             (Some(ScopedMinijail(jail)), Some(fd))
1762         } else {
1763             (None, None)
1764         };
1765 
1766     #[cfg(feature = "balloon")]
1767     let init_balloon_size = components
1768         .memory_size
1769         .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
1770             m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
1771         }))
1772         .context("failed to calculate init balloon size")?;
1773 
1774     #[cfg(feature = "direct")]
1775     let mut irqs = Vec::new();
1776 
1777     #[cfg(feature = "direct")]
1778     for irq in &cfg.direct_level_irq {
1779         if !sys_allocator.reserve_irq(*irq) {
1780             warn!("irq {} already reserved.", irq);
1781         }
1782         use devices::CrosvmDeviceId;
1783         let irq_event_source = IrqEventSource {
1784             device_id: CrosvmDeviceId::DirectIo.into(),
1785             queue_id: 0,
1786             device_name: format!("direct edge irq {}", irq),
1787         };
1788         let irq_evt = devices::IrqLevelEvent::new().context("failed to create event")?;
1789         irq_chip
1790             .register_level_irq_event(*irq, &irq_evt, irq_event_source)
1791             .unwrap();
1792         let direct_irq = devices::DirectIrq::new_level(&irq_evt)
1793             .context("failed to enable interrupt forwarding")?;
1794         direct_irq
1795             .irq_enable(*irq)
1796             .context("failed to enable interrupt forwarding")?;
1797         irqs.push(direct_irq);
1798     }
1799 
1800     #[cfg(feature = "direct")]
1801     for irq in &cfg.direct_edge_irq {
1802         if !sys_allocator.reserve_irq(*irq) {
1803             warn!("irq {} already reserved.", irq);
1804         }
1805         use devices::CrosvmDeviceId;
1806         let irq_event_source = IrqEventSource {
1807             device_id: CrosvmDeviceId::DirectIo.into(),
1808             queue_id: 0,
1809             device_name: format!("direct level irq {}", irq),
1810         };
1811         let irq_evt = devices::IrqEdgeEvent::new().context("failed to create event")?;
1812         irq_chip
1813             .register_edge_irq_event(*irq, &irq_evt, irq_event_source)
1814             .unwrap();
1815         let direct_irq = devices::DirectIrq::new_edge(&irq_evt)
1816             .context("failed to enable interrupt forwarding")?;
1817         direct_irq
1818             .irq_enable(*irq)
1819             .context("failed to enable interrupt forwarding")?;
1820         irqs.push(direct_irq);
1821     }
1822 
1823     // Reserve direct mmio range in advance.
1824     #[cfg(feature = "direct")]
1825     if let Some(mmio) = &cfg.direct_mmio {
1826         for range in mmio.ranges.iter() {
1827             AddressRange::from_start_and_size(range.base, range.len)
1828                 .ok_or(ResourceError::OutOfSpace)
1829                 .and_then(|range| sys_allocator.reserve_mmio(range))
1830                 .with_context(|| {
1831                     format!(
1832                         "failed to reserved direct mmio: {:x}-{:x}",
1833                         range.base,
1834                         range.base + range.len - 1,
1835                     )
1836                 })?;
1837         }
1838     };
1839 
1840     let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
1841         BTreeMap::new();
1842     let mut iova_max_addr: Option<u64> = None;
1843 
1844     let (reg_evt_wrtube, reg_evt_rdtube) =
1845         Tube::directional_pair().context("failed to create registered event tube")?;
1846 
1847     let mut devices = create_devices(
1848         &cfg,
1849         &mut vm,
1850         &mut sys_allocator,
1851         &vm_evt_wrtube,
1852         &mut iommu_attached_endpoints,
1853         &mut irq_control_tubes,
1854         &mut control_tubes,
1855         #[cfg(feature = "balloon")]
1856         balloon_device_tube,
1857         #[cfg(feature = "balloon")]
1858         balloon_wss_device_tube,
1859         #[cfg(feature = "balloon")]
1860         init_balloon_size,
1861         &mut disk_device_tubes,
1862         &mut pmem_device_tubes,
1863         &mut fs_device_tubes,
1864         #[cfg(feature = "usb")]
1865         usb_provider,
1866         #[cfg(feature = "gpu")]
1867         gpu_control_device_tube,
1868         #[cfg(feature = "gpu")]
1869         render_server_fd,
1870         &mut vvu_proxy_device_tubes,
1871         components.memory_size,
1872         &mut iova_max_addr,
1873         &reg_evt_wrtube,
1874     )?;
1875 
1876     #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
1877     let hp_endpoints_ranges: Vec<RangeInclusive<u32>> = Vec::new();
1878     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1879     let mut hp_endpoints_ranges: Vec<RangeInclusive<u32>> = Vec::new();
1880     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1881     let mut hotplug_buses: Vec<(u8, Arc<Mutex<dyn HotPlugBus>>)> = Vec::new();
1882     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1883     let mut gpe_notify_devs: Vec<(u32, Arc<Mutex<dyn GpeNotify>>)> = Vec::new();
1884     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1885     let mut pme_notify_devs: Vec<(u8, Arc<Mutex<dyn PmeNotify>>)> = Vec::new();
1886     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1887     {
1888         #[cfg(feature = "direct")]
1889         let rp_host = cfg.pcie_rp.clone();
1890         #[cfg(not(feature = "direct"))]
1891         let rp_host: Vec<HostPcieRootPortParameters> = Vec::new();
1892 
1893         // Create Pcie Root Port
1894         create_pcie_root_port(
1895             rp_host,
1896             &mut sys_allocator,
1897             &mut irq_control_tubes,
1898             &mut control_tubes,
1899             &mut devices,
1900             &mut hotplug_buses,
1901             &mut hp_endpoints_ranges,
1902             &mut gpe_notify_devs,
1903             &mut pme_notify_devs,
1904         )?;
1905     }
1906 
1907     arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
1908 
1909     let (translate_response_senders, request_rx) = setup_virtio_access_platform(
1910         &mut sys_allocator,
1911         &mut iommu_attached_endpoints,
1912         &mut devices,
1913     )?;
1914 
1915     let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
1916         || (cfg.vfio_isolate_hotplug && !hp_endpoints_ranges.is_empty())
1917     {
1918         let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
1919         let iommu_dev = create_iommu_device(
1920             cfg.protection_type,
1921             &cfg.jail_config,
1922             iova_max_addr.unwrap_or(u64::MAX),
1923             iommu_attached_endpoints,
1924             hp_endpoints_ranges,
1925             translate_response_senders,
1926             request_rx,
1927             iommu_device_tube,
1928         )?;
1929 
1930         let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1931         irq_control_tubes.push(msi_host_tube);
1932         let (ioevent_host_tube, ioevent_device_tube) =
1933             Tube::pair().context("failed to create ioevent tube")?;
1934         control_tubes.push(TaggedControlTube::VmMemory {
1935             tube: ioevent_host_tube,
1936             expose_with_viommu: false,
1937         });
1938         let mut dev = VirtioPciDevice::new(
1939             vm.get_memory().clone(),
1940             iommu_dev.dev,
1941             msi_device_tube,
1942             cfg.disable_virtio_intx,
1943             None,
1944             ioevent_device_tube,
1945         )
1946         .context("failed to create virtio pci dev")?;
1947         // early reservation for viommu.
1948         dev.allocate_address(&mut sys_allocator)
1949             .context("failed to allocate resources early for virtio pci dev")?;
1950         let dev = Box::new(dev);
1951         devices.push((dev, iommu_dev.jail));
1952         Some(iommu_host_tube)
1953     } else {
1954         None
1955     };
1956 
1957     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1958     for device in devices
1959         .iter_mut()
1960         .filter_map(|(dev, _)| dev.as_pci_device_mut())
1961     {
1962         let sdts = device
1963             .generate_acpi(components.acpi_sdts)
1964             .or_else(|| {
1965                 error!("ACPI table generation error");
1966                 None
1967             })
1968             .ok_or_else(|| anyhow!("failed to generate ACPI table"))?;
1969         components.acpi_sdts = sdts;
1970     }
1971 
1972     // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
1973     let mut vcpu_ids = Vec::new();
1974 
1975     #[cfg_attr(not(feature = "direct"), allow(unused_mut))]
1976     let mut linux = Arch::build_vm::<V, Vcpu>(
1977         components,
1978         &vm_evt_wrtube,
1979         &mut sys_allocator,
1980         &cfg.serial_parameters,
1981         simple_jail(&cfg.jail_config, "serial_device")?,
1982         battery,
1983         vm,
1984         ramoops_region,
1985         devices,
1986         irq_chip,
1987         &mut vcpu_ids,
1988         cfg.dump_device_tree_blob.clone(),
1989         simple_jail(&cfg.jail_config, "serial_device")?,
1990         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1991         simple_jail(&cfg.jail_config, "block_device")?,
1992         #[cfg(feature = "swap")]
1993         swap_controller.as_ref(),
1994     )
1995     .context("the architecture failed to build the vm")?;
1996 
1997     if let Some(tube) = linux.vm_request_tube.take() {
1998         control_tubes.push(TaggedControlTube::Vm(tube));
1999     }
2000 
2001     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2002     let (hp_control_tube, hp_worker_tube) = mpsc::channel();
2003 
2004     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2005     let hp_thread = {
2006         for (bus_num, hp_bus) in hotplug_buses {
2007             linux.hotplug_bus.insert(bus_num, hp_bus);
2008         }
2009 
2010         if let Some(pm) = &linux.pm {
2011             while let Some((gpe, notify_dev)) = gpe_notify_devs.pop() {
2012                 pm.lock().register_gpe_notify_dev(gpe, notify_dev);
2013             }
2014             while let Some((bus, notify_dev)) = pme_notify_devs.pop() {
2015                 pm.lock().register_pme_notify_dev(bus, notify_dev);
2016             }
2017         }
2018 
2019         let pci_root = linux.root_config.clone();
2020         std::thread::Builder::new()
2021             .name("pci_root".to_string())
2022             .spawn(move || start_pci_root_worker(pci_root, hp_worker_tube))?
2023     };
2024 
2025     #[cfg(feature = "direct")]
2026     if let Some(pmio) = &cfg.direct_pmio {
2027         let direct_io = Arc::new(
2028             devices::DirectIo::new(&pmio.path, false).context("failed to open direct io device")?,
2029         );
2030         for range in pmio.ranges.iter() {
2031             linux
2032                 .io_bus
2033                 .insert_sync(direct_io.clone(), range.base, range.len)
2034                 .context("Error with pmio")?;
2035         }
2036     };
2037 
2038     #[cfg(feature = "direct")]
2039     if let Some(mmio) = &cfg.direct_mmio {
2040         let direct_mmio = Arc::new(
2041             devices::DirectMmio::new(&mmio.path, false, &mmio.ranges)
2042                 .context("failed to open direct mmio device")?,
2043         );
2044 
2045         for range in mmio.ranges.iter() {
2046             linux
2047                 .mmio_bus
2048                 .insert_sync(direct_mmio.clone(), range.base, range.len)
2049                 .context("Error with mmio")?;
2050         }
2051     };
2052 
2053     let gralloc = RutabagaGralloc::new().context("failed to create gralloc")?;
2054 
2055     run_control(
2056         linux,
2057         sys_allocator,
2058         cfg,
2059         control_server_socket,
2060         irq_control_tubes,
2061         control_tubes,
2062         #[cfg(feature = "balloon")]
2063         balloon_host_tube,
2064         #[cfg(feature = "balloon")]
2065         balloon_wss_host_tube,
2066         &disk_host_tubes,
2067         #[cfg(feature = "gpu")]
2068         gpu_control_host_tube,
2069         #[cfg(feature = "usb")]
2070         usb_control_tube,
2071         vm_evt_rdtube,
2072         vm_evt_wrtube,
2073         sigchld_fd,
2074         gralloc,
2075         vcpu_ids,
2076         iommu_host_tube,
2077         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2078         hp_control_tube,
2079         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2080         hp_thread,
2081         #[cfg(feature = "swap")]
2082         swap_controller,
2083         reg_evt_rdtube,
2084     )
2085 }
2086 
2087 // Hotplug command is facing dead lock issue when it tries to acquire the lock
2088 // for pci root in the vm control thread. Dead lock could happen when the vm
2089 // control thread(Thread A namely) is handling the hotplug command and it tries
2090 // to get the lock for pci root. However, the lock is already hold by another
2091 // device in thread B, which is actively sending an vm control to be handled by
2092 // thread A and waiting for response. However, thread A is blocked on acquiring
2093 // the lock, so dead lock happens. In order to resolve this issue, we add this
2094 // worker thread and push all work that locks pci root to this thread.
2095 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
start_pci_root_worker( pci_root: Arc<Mutex<PciRoot>>, hp_device_tube: mpsc::Receiver<PciRootCommand>, )2096 fn start_pci_root_worker(
2097     pci_root: Arc<Mutex<PciRoot>>,
2098     hp_device_tube: mpsc::Receiver<PciRootCommand>,
2099 ) {
2100     loop {
2101         match hp_device_tube.recv() {
2102             Ok(cmd) => match cmd {
2103                 PciRootCommand::Add(addr, device) => {
2104                     pci_root.lock().add_device(addr, device);
2105                 }
2106                 PciRootCommand::AddBridge(pci_bus) => pci_root.lock().add_bridge(pci_bus),
2107                 PciRootCommand::Remove(addr) => {
2108                     pci_root.lock().remove_device(addr);
2109                 }
2110                 PciRootCommand::Kill => break,
2111             },
2112             Err(e) => {
2113                 error!("Error: pci root worker channel closed: {}", e);
2114                 break;
2115             }
2116         }
2117     }
2118 }
2119 
2120 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
get_hp_bus<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, host_addr: PciAddress, ) -> Result<Arc<Mutex<dyn HotPlugBus>>>2121 fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2122     linux: &RunnableLinuxVm<V, Vcpu>,
2123     host_addr: PciAddress,
2124 ) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2125     for (_, hp_bus) in linux.hotplug_bus.iter() {
2126         if hp_bus.lock().is_match(host_addr).is_some() {
2127             return Ok(hp_bus.clone());
2128         }
2129     }
2130     Err(anyhow!("Failed to find a suitable hotplug bus"))
2131 }
2132 
/// Hot-plugs the PCI device described by `device` into the running VM.
///
/// Behavior depends on `device.device_type`:
/// * `UpstreamPort` / `DownstreamPort`: wraps the host PCIe port in a
///   `PciBridge` and registers the new bridge as an additional hotplug bus
///   (keyed by its secondary bus number) so devices behind it can be plugged
///   later.
/// * `EndPoint`: creates a VFIO passthrough device; when a virtio-iommu is
///   present (`iommu_host_tube` is `Some`), the device's VFIO container is
///   also registered with the IOMMU.
///
/// In all cases the device is registered with the architecture, recorded on
/// the hotplug bus matching its host PCI address, and — if
/// `device.hp_interrupt` is set — a hotplug interrupt is raised.
///
/// Any host/control tubes created along the way are appended to
/// `irq_control_tubes` / `control_tubes` so the main loop can service them.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    cfg: &Config,
    irq_control_tubes: &mut Vec<Tube>,
    control_tubes: &mut Vec<TaggedControlTube>,
    hp_control_tube: &mpsc::Sender<PciRootCommand>,
    iommu_host_tube: &Option<Tube>,
    device: &HotPlugDeviceInfo,
    #[cfg(feature = "swap")] swap_controller: Option<&SwapController>,
) -> Result<()> {
    // The host sysfs path encodes the device's host PCI address.
    let host_addr = PciAddress::from_path(&device.path)
        .context("failed to parse hotplug device's PCI address")?;
    let hp_bus = get_hp_bus(linux, host_addr)?;

    let (host_key, pci_address) = match device.device_type {
        HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
            // Ports need a VM control tube (for the host port backend) and an
            // MSI tube (for the bridge's interrupts).
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            control_tubes.push(TaggedControlTube::Vm(vm_host_tube));
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            irq_control_tubes.push(msi_host_tube);
            let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
            let (host_key, pci_bridge) = match device.device_type {
                HotPlugDeviceType::UpstreamPort => {
                    let host_key = HostHotPlugKey::UpstreamPort { host_addr };
                    let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
                        pcie_host, true,
                    )?));
                    let pci_bridge =
                        Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
                    // The new bridge becomes a hotplug bus of its own, keyed by
                    // its secondary bus number.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
                    (host_key, pci_bridge)
                }
                HotPlugDeviceType::DownstreamPort => {
                    let host_key = HostHotPlugKey::DownstreamPort { host_addr };
                    let pcie_downstream_port = Arc::new(Mutex::new(
                        PcieDownstreamPort::new_from_host(pcie_host, true)?,
                    ));
                    let pci_bridge = Box::new(PciBridge::new(
                        pcie_downstream_port.clone(),
                        msi_device_tube,
                    ));
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
                    (host_key, pci_bridge)
                }
                _ => {
                    // Unreachable: the outer arm restricts device_type to the
                    // two port variants handled above.
                    bail!("Impossible to reach here")
                }
            };
            let pci_address = Arch::register_pci_device(
                linux,
                pci_bridge,
                None,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;

            (host_key, pci_address)
        }
        HotPlugDeviceType::EndPoint => {
            let host_key = HostHotPlugKey::Vfio { host_addr };
            // Route the device through the virtio-iommu only when one exists.
            let (vfio_device, jail, viommu_mapper) = create_vfio_device(
                &cfg.jail_config,
                &linux.vm,
                sys_allocator,
                irq_control_tubes,
                control_tubes,
                &device.path,
                true,
                None,
                None,
                None,
                if iommu_host_tube.is_some() {
                    IommuDevType::VirtioIommu
                } else {
                    IommuDevType::NoIommu
                },
                #[cfg(feature = "direct")]
                false,
            )?;
            let vfio_pci_device = match vfio_device {
                VfioDeviceVariant::Pci(pci) => Box::new(pci),
                VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
            };
            let pci_address = Arch::register_pci_device(
                linux,
                vfio_pci_device,
                jail,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
            if let Some(iommu_host_tube) = iommu_host_tube {
                // Announce the new endpoint (and its VFIO container fd) to the
                // virtio-iommu so guest DMA can be translated for it.
                let endpoint_addr = pci_address.to_u32();
                let vfio_wrapper = viommu_mapper.context("expected mapper")?;
                let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
                        endpoint_addr,
                        wrapper_id: vfio_wrapper.id(),
                        container: {
                            // Safe because the descriptor is uniquely owned by `descriptor`.
                            unsafe { File::from_raw_descriptor(descriptor) }
                        },
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }

            (host_key, pci_address)
        }
    };
    // Record the device on its hotplug bus and optionally raise the hotplug
    // interrupt toward the guest.
    hp_bus.lock().add_hotplug_device(host_key, pci_address);
    if device.hp_interrupt {
        hp_bus.lock().hot_plug(pci_address);
    }
    Ok(())
}
2263 
2264 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, buses_to_remove: &mut Vec<u8>, host_key: HostHotPlugKey, child_bus: u8, ) -> Result<()>2265 fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
2266     linux: &RunnableLinuxVm<V, Vcpu>,
2267     sys_allocator: &mut SystemAllocator,
2268     buses_to_remove: &mut Vec<u8>,
2269     host_key: HostHotPlugKey,
2270     child_bus: u8,
2271 ) -> Result<()> {
2272     for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
2273         let mut hp_bus_lock = hp_bus.lock();
2274         if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(host_key) {
2275             sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
2276             hp_bus_lock.hot_unplug(pci_addr);
2277             buses_to_remove.push(child_bus);
2278             if hp_bus_lock.is_empty() {
2279                 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2280                     remove_hotplug_bridge(
2281                         linux,
2282                         sys_allocator,
2283                         buses_to_remove,
2284                         hotplug_key,
2285                         *bus_num,
2286                     )?;
2287                 }
2288             }
2289             return Ok(());
2290         }
2291     }
2292 
2293     Err(anyhow!(
2294         "Can not find device {:?} on hotplug buses",
2295         host_key
2296     ))
2297 }
2298 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Hot-unplugs the device described by `device` from the VM.
///
/// Derives the device's hotplug key from its host PCI path, finds the hotplug
/// bus that holds it, detaches the endpoint from the virtio-iommu (when
/// `iommu_host_tube` is present), sends the hot-unplug notification, releases
/// the device's guest PCI resources, and finally tears down any emulated
/// bridges left empty by the removal.
///
/// Returns an error if the device cannot be found on any hotplug bus, if the
/// host PCI path cannot be parsed, or if the virtio-iommu request fails.
fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    iommu_host_tube: &Option<Tube>,
    device: &HotPlugDeviceInfo,
) -> Result<()> {
    // Map the device's host address and type to the key used when it was
    // registered on the hotplug bus.
    let host_addr = PciAddress::from_path(&device.path)?;
    let host_key = match device.device_type {
        HotPlugDeviceType::UpstreamPort => HostHotPlugKey::UpstreamPort { host_addr },
        HotPlugDeviceType::DownstreamPort => HostHotPlugKey::DownstreamPort { host_addr },
        HotPlugDeviceType::EndPoint => HostHotPlugKey::Vfio { host_addr },
    };

    // Locate the (bus number, bus) pair that currently holds this device.
    // The bus Arc is cloned so the lock below is independent of the iterator.
    let hp_bus = linux
        .hotplug_bus
        .iter()
        .find(|(_, hp_bus)| {
            let hp_bus = hp_bus.lock();
            hp_bus.get_hotplug_device(host_key).is_some()
        })
        .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));

    if let Some((bus_num, hp_bus)) = hp_bus {
        let mut buses_to_remove = Vec::new();
        let mut removed_key = None;
        let mut hp_bus_lock = hp_bus.lock();
        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(host_key) {
            // Ask the virtio-iommu to detach the endpoint before the guest
            // sees the unplug.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
                        endpoint_addr: pci_addr.to_u32(),
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }
            // Check whether all sibling downstream ports (other buses whose
            // downstream-port key shares this port's parent bus) are empty.
            let mut empty_simbling = true;
            if let Some(HostHotPlugKey::DownstreamPort { host_addr }) =
                hp_bus_lock.get_hotplug_key()
            {
                let addr_alias = host_addr;
                for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                    if *simbling_bus_num != bus_num {
                        let hp_bus_lock = hp_bus.lock();
                        let hotplug_key = hp_bus_lock.get_hotplug_key();
                        if let Some(HostHotPlugKey::DownstreamPort { host_addr }) = hotplug_key {
                            if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
                                empty_simbling = false;
                                break;
                            }
                        }
                    }
                }
            }

            // If all sibling downstream ports are empty, do not send hot unplug event for this
            // downstream port. Root port will send one plug out interrupt and remove all
            // the remaining devices
            if !empty_simbling {
                hp_bus_lock.hot_unplug(pci_addr);
            }

            sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
            if empty_simbling || hp_bus_lock.is_empty() {
                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
                    removed_key = Some(hotplug_key);
                    remove_hotplug_bridge(
                        linux,
                        sys_allocator,
                        &mut buses_to_remove,
                        hotplug_key,
                        bus_num,
                    )?;
                }
            }
        }

        // Some types of TBT device has a few empty downstream ports. The emulated bridges
        // of these ports won't be removed since no vfio device is connected to our emulated
        // bridges. So we explicitly check all sibling bridges of the removed bridge here,
        // and remove them if bridge has no child device connected.
        if let Some(HostHotPlugKey::DownstreamPort { host_addr }) = removed_key {
            let addr_alias = host_addr;
            for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                if *simbling_bus_num != bus_num {
                    let hp_bus_lock = hp_bus.lock();
                    let hotplug_key = hp_bus_lock.get_hotplug_key();
                    if let Some(HostHotPlugKey::DownstreamPort { host_addr }) = hotplug_key {
                        if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
                            remove_hotplug_bridge(
                                linux,
                                sys_allocator,
                                &mut buses_to_remove,
                                hotplug_key.unwrap(),
                                *simbling_bus_num,
                            )?;
                        }
                    }
                }
            }
        }
        // Drop the bus-map entries for every bridge removed above.
        for bus in buses_to_remove.iter() {
            linux.hotplug_bus.remove(bus);
        }
        return Ok(());
    }

    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        host_key
    ))
}
2415 
trigger_vm_suspend_and_wait_for_entry( guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>, tube: &SendTube, response: vm_control::VmResponse, suspend_evt: Event, pm: Option<Arc<Mutex<dyn PmResource + Send>>>, )2416 pub fn trigger_vm_suspend_and_wait_for_entry(
2417     guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
2418     tube: &SendTube,
2419     response: vm_control::VmResponse,
2420     suspend_evt: Event,
2421     pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
2422 ) {
2423     let (lock, cvar) = &*guest_suspended_cvar;
2424     let mut guest_suspended = lock.lock();
2425 
2426     *guest_suspended = false;
2427 
2428     // During suspend also emulate sleepbtn, which allows to suspend VM (if running e.g. acpid and
2429     // reacts on sleep button events)
2430     if let Some(pm) = pm {
2431         pm.lock().slpbtn_evt();
2432     } else {
2433         error!("generating sleepbtn during suspend not supported");
2434     }
2435 
2436     // Wait for notification about guest suspension, if not received after 15sec,
2437     // proceed anyway.
2438     let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
2439     guest_suspended = result.0;
2440 
2441     if result.1.timed_out() {
2442         warn!("Guest suspension timeout - proceeding anyway");
2443     } else if *guest_suspended {
2444         info!("Guest suspended");
2445     }
2446 
2447     if let Err(e) = suspend_evt.signal() {
2448         error!("failed to trigger suspend event: {}", e);
2449     }
2450     // Now we ready to send response over the tube and communicate that VM suspend has finished
2451     if let Err(e) = tube.send(&response) {
2452         error!("failed to send VmResponse: {}", e);
2453     }
2454 }
2455 
2456 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, cfg: &Config, add_irq_control_tubes: &mut Vec<Tube>, add_tubes: &mut Vec<TaggedControlTube>, hp_control_tube: &mpsc::Sender<PciRootCommand>, iommu_host_tube: &Option<Tube>, device: &HotPlugDeviceInfo, add: bool, #[cfg(feature = "swap")] swap_controller: Option<&SwapController>, ) -> VmResponse2457 fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
2458     linux: &mut RunnableLinuxVm<V, Vcpu>,
2459     sys_allocator: &mut SystemAllocator,
2460     cfg: &Config,
2461     add_irq_control_tubes: &mut Vec<Tube>,
2462     add_tubes: &mut Vec<TaggedControlTube>,
2463     hp_control_tube: &mpsc::Sender<PciRootCommand>,
2464     iommu_host_tube: &Option<Tube>,
2465     device: &HotPlugDeviceInfo,
2466     add: bool,
2467     #[cfg(feature = "swap")] swap_controller: Option<&SwapController>,
2468 ) -> VmResponse {
2469     let iommu_host_tube = if cfg.vfio_isolate_hotplug {
2470         iommu_host_tube
2471     } else {
2472         &None
2473     };
2474 
2475     let ret = if add {
2476         add_hotplug_device(
2477             linux,
2478             sys_allocator,
2479             cfg,
2480             add_irq_control_tubes,
2481             add_tubes,
2482             hp_control_tube,
2483             iommu_host_tube,
2484             device,
2485             #[cfg(feature = "swap")]
2486             swap_controller,
2487         )
2488     } else {
2489         remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
2490     };
2491 
2492     match ret {
2493         Ok(()) => VmResponse::Ok,
2494         Err(e) => {
2495             error!("hanlde_hotplug_command failure: {}", e);
2496             add_tubes.clear();
2497             VmResponse::Err(base::Error::new(libc::EINVAL))
2498         }
2499     }
2500 }
2501 
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, cfg: Config, control_server_socket: Option<UnlinkUnixSeqpacketListener>, irq_control_tubes: Vec<Tube>, mut control_tubes: Vec<TaggedControlTube>, #[cfg(feature = "balloon")] balloon_host_tube: Option<Tube>, #[cfg(feature = "balloon")] balloon_wss_host_tube: Option<Tube>, disk_host_tubes: &[Tube], #[cfg(feature = "gpu")] gpu_control_tube: Tube, #[cfg(feature = "usb")] usb_control_tube: Tube, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, sigchld_fd: SignalFd, mut gralloc: RutabagaGralloc, vcpu_ids: Vec<usize>, iommu_host_tube: Option<Tube>, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] hp_control_tube: mpsc::Sender< PciRootCommand, >, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] hp_thread: std::thread::JoinHandle<()>, #[cfg(feature = "swap")] swap_controller: Option<SwapController>, reg_evt_rdtube: RecvTube, ) -> Result<ExitState>2502 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
2503     mut linux: RunnableLinuxVm<V, Vcpu>,
2504     sys_allocator: SystemAllocator,
2505     cfg: Config,
2506     control_server_socket: Option<UnlinkUnixSeqpacketListener>,
2507     irq_control_tubes: Vec<Tube>,
2508     mut control_tubes: Vec<TaggedControlTube>,
2509     #[cfg(feature = "balloon")] balloon_host_tube: Option<Tube>,
2510     #[cfg(feature = "balloon")] balloon_wss_host_tube: Option<Tube>,
2511     disk_host_tubes: &[Tube],
2512     #[cfg(feature = "gpu")] gpu_control_tube: Tube,
2513     #[cfg(feature = "usb")] usb_control_tube: Tube,
2514     vm_evt_rdtube: RecvTube,
2515     vm_evt_wrtube: SendTube,
2516     sigchld_fd: SignalFd,
2517     mut gralloc: RutabagaGralloc,
2518     vcpu_ids: Vec<usize>,
2519     iommu_host_tube: Option<Tube>,
2520     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] hp_control_tube: mpsc::Sender<
2521         PciRootCommand,
2522     >,
2523     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] hp_thread: std::thread::JoinHandle<()>,
2524     #[cfg(feature = "swap")] swap_controller: Option<SwapController>,
2525     reg_evt_rdtube: RecvTube,
2526 ) -> Result<ExitState> {
2527     #[derive(EventToken)]
2528     enum Token {
2529         VmEvent,
2530         Suspend,
2531         ChildSignal,
2532         VmControlServer,
2533         VmControl { index: usize },
2534         RegisteredEvent,
2535     }
2536 
2537     // Tube keyed on the socket path used to create it.
2538     struct AddressedTube {
2539         tube: Rc<Tube>,
2540         socket_addr: String,
2541     }
2542 
2543     impl PartialEq for AddressedTube {
2544         fn eq(&self, other: &Self) -> bool {
2545             self.socket_addr == other.socket_addr
2546         }
2547     }
2548 
2549     impl Eq for AddressedTube {}
2550 
2551     impl Hash for AddressedTube {
2552         fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
2553             self.socket_addr.hash(state);
2554         }
2555     }
2556 
2557     impl AddressedTube {
2558         pub fn send<T: Serialize>(&self, msg: &T) -> Result<(), base::TubeError> {
2559             self.tube.send(msg)
2560         }
2561     }
2562 
2563     fn find_registered_tube<'a>(
2564         registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedTube>>,
2565         socket_addr: &str,
2566         event: RegisteredEvent,
2567     ) -> (Option<&'a Rc<Tube>>, bool) {
2568         let mut registered_tube: Option<&Rc<Tube>> = None;
2569         let mut already_registered = false;
2570         'outer: for (evt, addr_tubes) in registered_tubes {
2571             for addr_tube in addr_tubes {
2572                 if addr_tube.socket_addr == socket_addr {
2573                     if *evt == event {
2574                         already_registered = true;
2575                         break 'outer;
2576                     }
2577                     // Since all tubes of the same addr should
2578                     // be an RC to the same tube, it doesn't
2579                     // matter which one we get. But we do need
2580                     // to check for a registration for the
2581                     // current event, so can't break here.
2582                     registered_tube = Some(&addr_tube.tube);
2583                 }
2584             }
2585         }
2586         (registered_tube, already_registered)
2587     }
2588 
2589     fn make_addr_tube_from_maybe_existing(
2590         tube: Option<&Rc<Tube>>,
2591         addr: String,
2592     ) -> Result<AddressedTube> {
2593         if let Some(registered_tube) = tube {
2594             Ok(AddressedTube {
2595                 tube: registered_tube.clone(),
2596                 socket_addr: addr,
2597             })
2598         } else {
2599             let sock = UnixSeqpacket::connect(addr.clone()).with_context(|| {
2600                 format!("failed to connect to registered listening socket {}", addr)
2601             })?;
2602             let tube = Tube::new_from_unix_seqpacket(sock);
2603             Ok(AddressedTube {
2604                 tube: Rc::new(tube),
2605                 socket_addr: addr,
2606             })
2607         }
2608     }
2609 
2610     let mut iommu_client = iommu_host_tube
2611         .as_ref()
2612         .map(VmMemoryRequestIommuClient::new);
2613 
2614     stdin()
2615         .set_raw_mode()
2616         .expect("failed to set terminal raw mode");
2617 
2618     let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
2619 
2620     let wait_ctx = WaitContext::build_with(&[
2621         (&linux.suspend_evt, Token::Suspend),
2622         (&sigchld_fd, Token::ChildSignal),
2623         (&vm_evt_rdtube, Token::VmEvent),
2624         (&reg_evt_rdtube, Token::RegisteredEvent),
2625     ])
2626     .context("failed to build wait context")?;
2627 
2628     if let Some(socket_server) = &control_server_socket {
2629         wait_ctx
2630             .add(socket_server, Token::VmControlServer)
2631             .context("failed to add descriptor to wait context")?;
2632     }
2633     for (index, socket) in control_tubes.iter().enumerate() {
2634         wait_ctx
2635             .add(socket.as_ref(), Token::VmControl { index })
2636             .context("failed to add descriptor to wait context")?;
2637     }
2638 
2639     if cfg.jail_config.is_some() {
2640         // Before starting VCPUs, in case we started with some capabilities, drop them all.
2641         drop_capabilities().context("failed to drop process capabilities")?;
2642     }
2643 
2644     #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
2645     // Create a channel for GDB thread.
2646     let (to_gdb_channel, from_vcpu_channel) = if linux.gdb.is_some() {
2647         let (s, r) = mpsc::channel();
2648         (Some(s), Some(r))
2649     } else {
2650         (None, None)
2651     };
2652 
2653     let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
2654     // Create devices thread, and restore if a restore file exists.
2655     linux.devices_thread = match create_devices_worker_thread(
2656         linux.vm.get_memory().clone(),
2657         linux.io_bus.clone(),
2658         linux.mmio_bus.clone(),
2659         device_ctrl_resp,
2660     ) {
2661         Ok(join_handle) => Some(join_handle),
2662         Err(e) => {
2663             return Err(anyhow!("Failed to start devices thread: {}", e));
2664         }
2665     };
2666 
2667     let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
2668     let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
2669     let use_hypervisor_signals = !linux
2670         .vm
2671         .get_hypervisor()
2672         .check_capability(HypervisorCap::ImmediateExit);
2673     vcpu::setup_vcpu_signal_handler::<Vcpu>(use_hypervisor_signals)?;
2674 
2675     let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
2676         Some(vec) => vec.into_iter().map(Some).collect(),
2677         None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
2678     };
2679     // Enable core scheduling before creating vCPUs so that the cookie will be
2680     // shared by all vCPU threads.
2681     // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
2682     // itself for even better performance. Only vCPUs need the feature.
2683     if cfg.core_scheduling && cfg.per_vm_core_scheduling {
2684         if let Err(e) = enable_core_scheduling() {
2685             error!("Failed to enable core scheduling: {}", e);
2686         }
2687     }
2688     let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
2689         None => None,
2690         Some(cgroup_path) => {
2691             // Move main process to cgroup_path
2692             let mut f = File::create(&cgroup_path.join("tasks")).with_context(|| {
2693                 format!(
2694                     "failed to create vcpu-cgroup-path {}",
2695                     cgroup_path.display(),
2696                 )
2697             })?;
2698             f.write_all(process::id().to_string().as_bytes())?;
2699             Some(f)
2700         }
2701     };
2702     #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), unix))]
2703     let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
2704     #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), unix))]
2705     if cfg.bus_lock_ratelimit > 0 {
2706         let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
2707         if linux.vm.check_capability(VmCap::BusLockDetect) {
2708             info!("Hypervisor support bus lock detect");
2709             linux
2710                 .vm
2711                 .enable_capability(VmCap::BusLockDetect, 0)
2712                 .expect("kvm: Failed to enable bus lock detection cap");
2713             info!("Hypervisor enabled bus lock detect");
2714             bus_lock_ratelimit_ctrl
2715                 .lock()
2716                 .ratelimit_set_speed(bus_lock_ratelimit);
2717         } else {
2718             bail!("Kvm: bus lock detection unsuported");
2719         }
2720     }
2721 
2722     #[cfg(target_os = "android")]
2723     android::set_process_profiles(&cfg.task_profiles)?;
2724 
2725     let guest_suspended_cvar = Arc::new((Mutex::new(false), Condvar::new()));
2726 
2727     #[allow(unused_mut)]
2728     let mut run_mode = VmRunMode::Running;
2729     #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
2730     if to_gdb_channel.is_some() {
2731         // Wait until a GDB client attaches
2732         run_mode = VmRunMode::Breakpoint;
2733     }
2734     // If we are restoring from a snapshot, then start suspended.
2735     let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
2736         (VmRunMode::Suspending, run_mode)
2737     } else {
2738         (run_mode, run_mode)
2739     };
2740 
2741     // Architecture-specific code must supply a vcpu_init element for each VCPU.
2742     assert_eq!(vcpus.len(), linux.vcpu_init.len());
2743 
2744     for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
2745     {
2746         let (to_vcpu_channel, from_main_channel) = mpsc::channel();
2747         let vcpu_affinity = match linux.vcpu_affinity.clone() {
2748             Some(VcpuAffinity::Global(v)) => v,
2749             Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
2750             None => Default::default(),
2751         };
2752 
2753         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2754         let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
2755             Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
2756         } else {
2757             None
2758         };
2759 
2760         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2761         let cpu_config = Some(CpuConfigX86_64::new(
2762             cfg.force_calibrated_tsc_leaf,
2763             cfg.host_cpu_topology,
2764             cfg.enable_hwp,
2765             cfg.enable_pnp_data,
2766             cfg.no_smt,
2767             cfg.itmt,
2768             vcpu_hybrid_type,
2769         ));
2770         #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), unix))]
2771         let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
2772 
2773         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2774         let cpu_config = None;
2775 
2776         let handle = vcpu::run_vcpu(
2777             cpu_id,
2778             vcpu_ids[cpu_id],
2779             vcpu,
2780             vcpu_init,
2781             linux.vm.try_clone().context("failed to clone vm")?,
2782             linux
2783                 .irq_chip
2784                 .try_box_clone()
2785                 .context("failed to clone irqchip")?,
2786             linux.vcpu_count,
2787             linux.rt_cpus.contains(&cpu_id),
2788             vcpu_affinity,
2789             linux.delay_rt,
2790             vcpu_thread_barrier.clone(),
2791             linux.has_bios,
2792             (*linux.io_bus).clone(),
2793             (*linux.mmio_bus).clone(),
2794             vm_evt_wrtube
2795                 .try_clone()
2796                 .context("failed to clone vm event tube")?,
2797             linux.vm.check_capability(VmCap::PvClockSuspend),
2798             from_main_channel,
2799             use_hypervisor_signals,
2800             #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
2801             to_gdb_channel.clone(),
2802             cfg.core_scheduling,
2803             cfg.per_vm_core_scheduling,
2804             cpu_config,
2805             cfg.privileged_vm,
2806             match vcpu_cgroup_tasks_file {
2807                 None => None,
2808                 Some(ref f) => Some(
2809                     f.try_clone()
2810                         .context("failed to clone vcpu cgroup tasks file")?,
2811                 ),
2812             },
2813             cfg.userspace_msr.clone(),
2814             guest_suspended_cvar.clone(),
2815             #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), unix))]
2816             bus_lock_ratelimit_ctrl,
2817             run_mode,
2818         )?;
2819         vcpu_handles.push((handle, to_vcpu_channel));
2820     }
2821 
2822     #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
2823     // Spawn GDB thread.
2824     if let Some((gdb_port_num, gdb_control_tube)) = linux.gdb.take() {
2825         let to_vcpu_channels = vcpu_handles
2826             .iter()
2827             .map(|(_handle, channel)| channel.clone())
2828             .collect();
2829         let target = GdbStub::new(
2830             gdb_control_tube,
2831             to_vcpu_channels,
2832             from_vcpu_channel.unwrap(), // Must succeed to unwrap()
2833         );
2834         std::thread::Builder::new()
2835             .name("gdb".to_owned())
2836             .spawn(move || gdb_thread(target, gdb_port_num))
2837             .context("failed to spawn GDB thread")?;
2838     };
2839 
2840     let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
2841     let sys_allocator_for_thread = sys_allocator_mutex.clone();
2842     let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
2843     let irq_handler_thread = std::thread::Builder::new()
2844         .name("irq_handler_thread".into())
2845         .spawn(move || {
2846             irq_handler_thread(
2847                 irq_control_tubes,
2848                 irq_chip_for_thread,
2849                 sys_allocator_for_thread,
2850                 irq_handler_control_for_thread,
2851             )
2852         })
2853         .unwrap();
2854 
2855     vcpu_thread_barrier.wait();
2856 
2857     // Restore VM (if applicable).
2858     // Must happen after the vCPU barrier to avoid deadlock.
2859     if let Some(path) = &cfg.restore_path {
2860         vm_control::do_restore(
2861             path.clone(),
2862             |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
2863             |msg, index| {
2864                 vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
2865             },
2866             &device_ctrl_tube,
2867             linux.vcpu_count,
2868         )?;
2869         // Allow the vCPUs to start for real.
2870         vcpu::kick_all_vcpus(
2871             &vcpu_handles,
2872             linux.irq_chip.as_irq_chip(),
2873             VcpuControl::RunState(post_restore_run_mode),
2874         )
2875     }
2876 
2877     let mut exit_state = ExitState::Stop;
2878     let mut pvpanic_code = PvPanicCode::Unknown;
2879     #[cfg(feature = "balloon")]
2880     let mut balloon_stats_id: u64 = 0;
2881     #[cfg(feature = "balloon")]
2882     let mut balloon_wss_id: u64 = 0;
2883     let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedTube>> = HashMap::new();
2884 
2885     'wait: loop {
2886         let events = {
2887             match wait_ctx.wait() {
2888                 Ok(v) => v,
2889                 Err(e) => {
2890                     error!("failed to poll: {}", e);
2891                     break;
2892                 }
2893             }
2894         };
2895 
2896         let mut vm_control_indices_to_remove = Vec::new();
2897         for event in events.iter().filter(|e| e.is_readable) {
2898             match event.token {
2899                 Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEvent>() {
2900                     Ok(reg_evt) => {
2901                         let mut tubes_to_remove: Vec<String> = Vec::new();
2902                         if let Some(tubes) = registered_evt_tubes.get_mut(&reg_evt) {
2903                             for tube in tubes.iter() {
2904                                 if let Err(e) = tube.send(&reg_evt) {
2905                                     warn!(
2906                                         "failed to send registered event {:?} to {}, removing from \
2907                                          registrations: {}",
2908                                         reg_evt, tube.socket_addr, e
2909                                     );
2910                                     tubes_to_remove.push(tube.socket_addr.clone());
2911                                 }
2912                             }
2913                         }
2914                         for tube_addr in tubes_to_remove {
2915                             for tubes in registered_evt_tubes.values_mut() {
2916                                 tubes.retain(|t| t.socket_addr != tube_addr);
2917                             }
2918                         }
2919                         registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
2920                     }
2921                     Err(e) => {
2922                         warn!("failed to recv RegisteredEvent: {}", e);
2923                     }
2924                 },
2925                 Token::VmEvent => {
2926                     let mut break_to_wait: bool = true;
2927                     match vm_evt_rdtube.recv::<VmEventType>() {
2928                         Ok(vm_event) => match vm_event {
2929                             VmEventType::Exit => {
2930                                 info!("vcpu requested shutdown");
2931                                 exit_state = ExitState::Stop;
2932                             }
2933                             VmEventType::Reset => {
2934                                 info!("vcpu requested reset");
2935                                 exit_state = ExitState::Reset;
2936                             }
2937                             VmEventType::Crash => {
2938                                 info!("vcpu crashed");
2939                                 exit_state = ExitState::Crash;
2940                             }
2941                             VmEventType::Panic(panic_code) => {
2942                                 pvpanic_code = PvPanicCode::from_u8(panic_code);
2943                                 info!("Guest reported panic [Code: {}]", pvpanic_code);
2944                                 break_to_wait = false;
2945                             }
2946                             VmEventType::WatchdogReset => {
2947                                 info!("vcpu stall detected");
2948                                 exit_state = ExitState::WatchdogReset;
2949                             }
2950                         },
2951                         Err(e) => {
2952                             warn!("failed to recv VmEvent: {}", e);
2953                         }
2954                     }
2955                     if break_to_wait {
2956                         if pvpanic_code == PvPanicCode::Panicked {
2957                             exit_state = ExitState::GuestPanic;
2958                         }
2959                         break 'wait;
2960                     }
2961                 }
2962                 Token::Suspend => {
2963                     info!("VM requested suspend");
2964                     linux.suspend_evt.wait().unwrap();
2965                     vcpu::kick_all_vcpus(
2966                         &vcpu_handles,
2967                         linux.irq_chip.as_irq_chip(),
2968                         VcpuControl::RunState(VmRunMode::Suspending),
2969                     );
2970                 }
2971                 Token::ChildSignal => {
2972                     // Print all available siginfo structs, then exit the loop if child process has
2973                     // been exited except CLD_STOPPED and CLD_CONTINUED. the two should be ignored
2974                     // here since they are used by the vmm-swap feature.
2975                     let mut do_exit = false;
2976                     while let Some(siginfo) =
2977                         sigchld_fd.read().context("failed to create signalfd")?
2978                     {
2979                         let pid = siginfo.ssi_pid;
2980                         let pid_label = match linux.pid_debug_label_map.get(&pid) {
2981                             Some(label) => format!("{} (pid {})", label, pid),
2982                             None => format!("pid {}", pid),
2983                         };
2984 
2985                         // TODO(kawasin): this is a temporary exception until device suspension.
2986                         #[cfg(feature = "swap")]
2987                         if siginfo.ssi_code == libc::CLD_STOPPED
2988                             || siginfo.ssi_code == libc::CLD_CONTINUED
2989                         {
2990                             continue;
2991                         }
2992 
2993                         error!(
2994                             "child {} exited: signo {}, status {}, code {}",
2995                             pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
2996                         );
2997                         do_exit = true;
2998                     }
2999                     if do_exit {
3000                         exit_state = ExitState::Crash;
3001                         break 'wait;
3002                     }
3003                 }
3004                 Token::VmControlServer => {
3005                     if let Some(socket_server) = &control_server_socket {
3006                         match socket_server.accept() {
3007                             Ok(socket) => {
3008                                 wait_ctx
3009                                     .add(
3010                                         &socket,
3011                                         Token::VmControl {
3012                                             index: control_tubes.len(),
3013                                         },
3014                                     )
3015                                     .context("failed to add descriptor to wait context")?;
3016                                 control_tubes.push(TaggedControlTube::Vm(
3017                                     Tube::new_from_unix_seqpacket(socket),
3018                                 ));
3019                             }
3020                             Err(e) => error!("failed to accept socket: {}", e),
3021                         }
3022                     }
3023                 }
3024                 Token::VmControl { index } => {
3025                     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
3026                     let mut add_tubes = Vec::new();
3027                     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
3028                     let mut add_irq_control_tubes = Vec::new();
3029                     if let Some(socket) = control_tubes.get(index) {
3030                         match socket {
3031                             TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3032                                 Ok(request) => {
3033                                     let mut suspend_requested = false;
3034                                     let mut run_mode_opt = None;
3035                                     let response = match request {
3036                                         VmRequest::HotPlugCommand { device, add } => {
3037                                             #[cfg(any(
3038                                                 target_arch = "x86",
3039                                                 target_arch = "x86_64"
3040                                             ))]
3041                                             {
3042                                                 handle_hotplug_command(
3043                                                     &mut linux,
3044                                                     &mut sys_allocator_mutex.lock(),
3045                                                     &cfg,
3046                                                     &mut add_irq_control_tubes,
3047                                                     &mut add_tubes,
3048                                                     &hp_control_tube,
3049                                                     &iommu_host_tube,
3050                                                     &device,
3051                                                     add,
3052                                                     #[cfg(feature = "swap")]
3053                                                     swap_controller.as_ref(),
3054                                                 )
3055                                             }
3056 
3057                                             #[cfg(not(any(
3058                                                 target_arch = "x86",
3059                                                 target_arch = "x86_64"
3060                                             )))]
3061                                             {
3062                                                 // Suppress warnings.
3063                                                 let _ = (device, add);
3064                                                 VmResponse::Ok
3065                                             }
3066                                         }
3067                                         VmRequest::RegisterListener { socket_addr, event } => {
3068                                             let (registered_tube, already_registered) =
3069                                                 find_registered_tube(
3070                                                     &registered_evt_tubes,
3071                                                     &socket_addr,
3072                                                     event,
3073                                                 );
3074 
3075                                             if !already_registered {
3076                                                 let addr_tube = make_addr_tube_from_maybe_existing(
3077                                                     registered_tube,
3078                                                     socket_addr,
3079                                                 )?;
3080 
3081                                                 if let Some(tubes) =
3082                                                     registered_evt_tubes.get_mut(&event)
3083                                                 {
3084                                                     tubes.insert(addr_tube);
3085                                                 } else {
3086                                                     registered_evt_tubes.insert(
3087                                                         event,
3088                                                         vec![addr_tube].into_iter().collect(),
3089                                                     );
3090                                                 }
3091                                             }
3092                                             VmResponse::Ok
3093                                         }
3094                                         VmRequest::UnregisterListener { socket_addr, event } => {
3095                                             if let Some(tubes) =
3096                                                 registered_evt_tubes.get_mut(&event)
3097                                             {
3098                                                 tubes.retain(|t| t.socket_addr != socket_addr);
3099                                             }
3100                                             registered_evt_tubes
3101                                                 .retain(|_, tubes| !tubes.is_empty());
3102                                             VmResponse::Ok
3103                                         }
3104                                         VmRequest::Unregister { socket_addr } => {
3105                                             for (_, tubes) in registered_evt_tubes.iter_mut() {
3106                                                 tubes.retain(|t| t.socket_addr != socket_addr);
3107                                             }
3108                                             registered_evt_tubes
3109                                                 .retain(|_, tubes| !tubes.is_empty());
3110                                             VmResponse::Ok
3111                                         }
3112                                         _ => {
3113                                             let response = request.execute(
3114                                                 &mut run_mode_opt,
3115                                                 #[cfg(feature = "balloon")]
3116                                                 balloon_host_tube.as_ref(),
3117                                                 #[cfg(feature = "balloon")]
3118                                                 balloon_wss_host_tube.as_ref(),
3119                                                 #[cfg(feature = "balloon")]
3120                                                 &mut balloon_stats_id,
3121                                                 #[cfg(feature = "balloon")]
3122                                                 &mut balloon_wss_id,
3123                                                 disk_host_tubes,
3124                                                 &mut linux.pm,
3125                                                 #[cfg(feature = "gpu")]
3126                                                 &gpu_control_tube,
3127                                                 #[cfg(feature = "usb")]
3128                                                 Some(&usb_control_tube),
3129                                                 #[cfg(not(feature = "usb"))]
3130                                                 None,
3131                                                 &mut linux.bat_control,
3132                                                 |msg| {
3133                                                     vcpu::kick_all_vcpus(
3134                                                         &vcpu_handles,
3135                                                         linux.irq_chip.as_irq_chip(),
3136                                                         msg,
3137                                                     )
3138                                                 },
3139                                                 |msg, index| {
3140                                                     vcpu::kick_vcpu(
3141                                                         &vcpu_handles.get(index),
3142                                                         linux.irq_chip.as_irq_chip(),
3143                                                         msg,
3144                                                     )
3145                                                 },
3146                                                 cfg.force_s2idle,
3147                                                 #[cfg(feature = "swap")]
3148                                                 swap_controller.as_ref(),
3149                                                 &device_ctrl_tube,
3150                                                 vcpu_handles.len(),
3151                                                 &irq_handler_control,
3152                                             );
3153 
3154                                             // For non s2idle guest suspension we are done
3155                                             if let VmRequest::Suspend = request {
3156                                                 if cfg.force_s2idle {
3157                                                     suspend_requested = true;
3158 
3159                                                     // Spawn s2idle wait thread.
3160                                                     let send_tube =
3161                                                         tube.try_clone_send_tube().unwrap();
3162                                                     let suspend_evt =
3163                                                         linux.suspend_evt.try_clone().unwrap();
3164                                                     let guest_suspended_cvar =
3165                                                         guest_suspended_cvar.clone();
3166                                                     let delayed_response = response.clone();
3167                                                     let pm = linux.pm.clone();
3168 
3169                                                     std::thread::Builder::new()
3170                                                         .name("s2idle_wait".to_owned())
3171                                                         .spawn(move || {
3172                                                             trigger_vm_suspend_and_wait_for_entry(
3173                                                                 guest_suspended_cvar,
3174                                                                 &send_tube,
3175                                                                 delayed_response,
3176                                                                 suspend_evt,
3177                                                                 pm,
3178                                                             )
3179                                                         })
3180                                                         .context(
3181                                                             "failed to spawn s2idle_wait thread",
3182                                                         )?;
3183                                                 }
3184                                             }
3185                                             response
3186                                         }
3187                                     };
3188 
3189                                     // If suspend requested skip that step since it will be
3190                                     // performed by s2idle_wait thread when suspension actually
3191                                     // happens.
3192                                     if !suspend_requested {
3193                                         if let Err(e) = tube.send(&response) {
3194                                             error!("failed to send VmResponse: {}", e);
3195                                         }
3196                                     }
3197 
3198                                     if let Some(run_mode) = run_mode_opt {
3199                                         info!("control socket changed run mode to {}", run_mode);
3200                                         match run_mode {
3201                                             VmRunMode::Exiting => {
3202                                                 break 'wait;
3203                                             }
3204                                             other => {
3205                                                 if other == VmRunMode::Running {
3206                                                     for dev in &linux.resume_notify_devices {
3207                                                         dev.lock().resume_imminent();
3208                                                     }
3209                                                 }
3210                                                 // If suspend requested skip that step since it
3211                                                 // will be performed by s2idle_wait thread when
3212                                                 // needed.
3213                                                 if !suspend_requested {
3214                                                     vcpu::kick_all_vcpus(
3215                                                         &vcpu_handles,
3216                                                         linux.irq_chip.as_irq_chip(),
3217                                                         VcpuControl::RunState(other),
3218                                                     );
3219                                                 }
3220                                             }
3221                                         }
3222                                     }
3223                                 }
3224                                 Err(e) => {
3225                                     if let TubeError::Disconnected = e {
3226                                         vm_control_indices_to_remove.push(index);
3227                                     } else {
3228                                         error!("failed to recv VmRequest: {}", e);
3229                                     }
3230                                 }
3231                             },
3232                             TaggedControlTube::VmMemory {
3233                                 tube,
3234                                 expose_with_viommu,
3235                             } => match tube.recv::<VmMemoryRequest>() {
3236                                 Ok(request) => {
3237                                     let response = request.execute(
3238                                         &mut linux.vm,
3239                                         &mut sys_allocator_mutex.lock(),
3240                                         &mut gralloc,
3241                                         if *expose_with_viommu {
3242                                             iommu_client.as_mut()
3243                                         } else {
3244                                             None
3245                                         },
3246                                     );
3247                                     if let Err(e) = tube.send(&response) {
3248                                         error!("failed to send VmMemoryControlResponse: {}", e);
3249                                     }
3250                                 }
3251                                 Err(e) => {
3252                                     if let TubeError::Disconnected = e {
3253                                         vm_control_indices_to_remove.push(index);
3254                                     } else {
3255                                         error!("failed to recv VmMemoryControlRequest: {}", e);
3256                                     }
3257                                 }
3258                             },
3259                             TaggedControlTube::VmMsync(tube) => {
3260                                 match tube.recv::<VmMsyncRequest>() {
3261                                     Ok(request) => {
3262                                         let response = request.execute(&mut linux.vm);
3263                                         if let Err(e) = tube.send(&response) {
3264                                             error!("failed to send VmMsyncResponse: {}", e);
3265                                         }
3266                                     }
3267                                     Err(e) => {
3268                                         if let TubeError::Disconnected = e {
3269                                             vm_control_indices_to_remove.push(index);
3270                                         } else {
3271                                             error!("failed to recv VmMsyncRequest: {}", e);
3272                                         }
3273                                     }
3274                                 }
3275                             }
3276                             TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3277                                 Ok(request) => {
3278                                     let response = request
3279                                         .execute(&mut linux.vm, &mut sys_allocator_mutex.lock());
3280                                     if let Err(e) = tube.send(&response) {
3281                                         error!("failed to send VmResponse: {}", e);
3282                                     }
3283                                 }
3284                                 Err(e) => {
3285                                     if let TubeError::Disconnected = e {
3286                                         vm_control_indices_to_remove.push(index);
3287                                     } else {
3288                                         error!("failed to recv VmResponse: {}", e);
3289                                     }
3290                                 }
3291                             },
3292                         }
3293                     }
3294                     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
3295                     if !add_tubes.is_empty() {
3296                         for (idx, socket) in add_tubes.iter().enumerate() {
3297                             wait_ctx
3298                                 .add(
3299                                     socket.as_ref(),
3300                                     Token::VmControl {
3301                                         index: idx + control_tubes.len(),
3302                                     },
3303                                 )
3304                                 .context(
3305                                     "failed to add hotplug vfio-pci descriptor to wait context",
3306                                 )?;
3307                         }
3308                         control_tubes.append(&mut add_tubes);
3309                     }
3310                     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
3311                     if !add_irq_control_tubes.is_empty() {
3312                         irq_handler_control.send(&IrqHandlerRequest::AddIrqControlTubes(
3313                             add_irq_control_tubes,
3314                         ))?;
3315                     }
3316                 }
3317             }
3318         }
3319 
3320         remove_hungup_and_drained_tubes(
3321             &events,
3322             &wait_ctx,
3323             &mut control_tubes,
3324             vm_control_indices_to_remove,
3325             |token: &Token| {
3326                 if let Token::VmControl { index } = token {
3327                     return Some(*index);
3328                 }
3329                 None
3330             },
3331             |index: usize| Token::VmControl { index },
3332         )?;
3333     }
3334 
3335     vcpu::kick_all_vcpus(
3336         &vcpu_handles,
3337         linux.irq_chip.as_irq_chip(),
3338         VcpuControl::RunState(VmRunMode::Exiting),
3339     );
3340     for (handle, _) in vcpu_handles {
3341         if let Err(e) = handle.join() {
3342             error!("failed to join vcpu thread: {:?}", e);
3343         }
3344     }
3345 
3346     #[cfg(feature = "swap")]
3347     // Stop the snapshot monitor process
3348     if let Some(swap_controller) = swap_controller {
3349         if let Err(e) = swap_controller.exit() {
3350             error!("failed to exit snapshot monitor process: {:?}", e);
3351         }
3352     }
3353 
3354     // Stop pci root worker thread
3355     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
3356     {
3357         let _ = hp_control_tube.send(PciRootCommand::Kill);
3358         if let Err(e) = hp_thread.join() {
3359             error!("failed to join hotplug thread: {:?}", e);
3360         }
3361     }
3362 
3363     if linux.devices_thread.is_some() {
3364         if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
3365             error!("failed to stop device control loop: {}", e);
3366         };
3367         if let Some(thread) = linux.devices_thread.take() {
3368             if let Err(e) = thread.join() {
3369                 error!("failed to exit devices thread: {:?}", e);
3370             }
3371         }
3372     }
3373 
3374     // Shut down the IRQ handler thread.
3375     if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
3376         error!("failed to request exit from IRQ handler thread: {}", e);
3377     }
3378     if let Err(e) = irq_handler_thread.join() {
3379         error!("failed to exit irq handler thread: {:?}", e);
3380     }
3381 
3382     // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
3383     // inside `linux`. If the checks below fail, then some other thread is probably still running
3384     // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
3385     // cleaned up.
3386     match Arc::try_unwrap(std::mem::replace(&mut linux.mmio_bus, Arc::new(Bus::new()))) {
3387         Ok(_) => {}
3388         Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
3389     }
3390     match Arc::try_unwrap(std::mem::replace(&mut linux.io_bus, Arc::new(Bus::new()))) {
3391         Ok(_) => {}
3392         Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
3393     }
3394 
3395     // Explicitly drop the VM structure here to allow the devices to clean up before the
3396     // control sockets are closed when this function exits.
3397     mem::drop(linux);
3398 
3399     stdin()
3400         .set_canon_mode()
3401         .expect("failed to restore canonical mode for terminal");
3402 
3403     Ok(exit_state)
3404 }
3405 
/// Wait-context tokens for the IRQ handler thread (`irq_handler_thread`).
#[derive(EventToken)]
enum IrqHandlerToken {
    // An irqchip IRQ event fired; `index` is passed to
    // `irq_chip.service_irq_event(index)` to deliver it.
    IrqFd { index: IrqEventIndex },
    // A `VmIrqRequest` is readable on `irq_control_tubes[index]`.
    VmIrq { index: usize },
    // The irqchip's delayed-event trigger fired; drained via
    // `irq_chip.process_delayed_irq_events()`.
    DelayedIrqFd,
    // A request (exit, add tubes, wake/notify) arrived on the
    // `handler_control` tube from the main VM loop.
    HandlerControl,
}
3413 
3414 /// Handles IRQs and requests from devices to add additional IRQ lines.
irq_handler_thread( mut irq_control_tubes: Vec<Tube>, mut irq_chip: Box<dyn IrqChipArch + 'static>, sys_allocator_mutex: Arc<Mutex<SystemAllocator>>, handler_control: Tube, ) -> anyhow::Result<()>3415 fn irq_handler_thread(
3416     mut irq_control_tubes: Vec<Tube>,
3417     mut irq_chip: Box<dyn IrqChipArch + 'static>,
3418     sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
3419     handler_control: Tube,
3420 ) -> anyhow::Result<()> {
3421     let wait_ctx = WaitContext::build_with(&[(
3422         handler_control.get_read_notifier(),
3423         IrqHandlerToken::HandlerControl,
3424     )])
3425     .context("failed to build wait context")?;
3426 
3427     if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
3428         wait_ctx
3429             .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
3430             .context("failed to add descriptor to wait context")?;
3431     }
3432 
3433     let events = irq_chip
3434         .irq_event_tokens()
3435         .context("failed get event tokens from irqchip")?;
3436 
3437     for (index, _gsi, evt) in events {
3438         wait_ctx
3439             .add(&evt, IrqHandlerToken::IrqFd { index })
3440             .context("failed to add irq chip event tokens to wait context")?;
3441     }
3442 
3443     for (index, socket) in irq_control_tubes.iter().enumerate() {
3444         wait_ctx
3445             .add(socket.get_read_notifier(), IrqHandlerToken::VmIrq { index })
3446             .context("irq control tubes to wait context")?;
3447     }
3448 
3449     'wait: loop {
3450         let events = {
3451             match wait_ctx.wait() {
3452                 Ok(v) => v,
3453                 Err(e) => {
3454                     error!("failed to poll: {}", e);
3455                     break 'wait;
3456                 }
3457             }
3458         };
3459         let token_count = events.len();
3460         let mut vm_irq_tubes_to_remove = Vec::new();
3461         let mut notify_control_on_iteration_end = false;
3462 
3463         for event in events.iter().filter(|e| e.is_readable) {
3464             match event.token {
3465                 IrqHandlerToken::HandlerControl => {
3466                     match handler_control.recv::<IrqHandlerRequest>() {
3467                         Ok(request) => {
3468                             match request {
3469                                 IrqHandlerRequest::Exit => break 'wait,
3470                                 IrqHandlerRequest::AddIrqControlTubes(mut tubes) => {
3471                                     for (index, socket) in tubes.iter().enumerate() {
3472                                         wait_ctx
3473                                         .add(
3474                                             socket.get_read_notifier(),
3475                                             IrqHandlerToken::VmIrq {
3476                                                 index: irq_control_tubes.len() + index,
3477                                             },
3478                                         )
3479                                         .context("failed to add new IRQ control Tube to wait context")?;
3480                                     }
3481                                     irq_control_tubes.append(&mut tubes);
3482                                 }
3483                                 IrqHandlerRequest::WakeAndNotifyIteration => {
3484                                     notify_control_on_iteration_end = true;
3485                                 }
3486                             }
3487                         }
3488                         Err(e) => {
3489                             if let TubeError::Disconnected = e {
3490                                 panic!("irq handler control tube disconnected.");
3491                             } else {
3492                                 error!("failed to recv IrqHandlerRequest: {}", e);
3493                             }
3494                         }
3495                     }
3496                 }
3497                 IrqHandlerToken::VmIrq { index } => {
3498                     if let Some(tube) = irq_control_tubes.get(index) {
3499                         handle_irq_tube_request(
3500                             &sys_allocator_mutex,
3501                             &mut irq_chip,
3502                             &mut vm_irq_tubes_to_remove,
3503                             &wait_ctx,
3504                             tube,
3505                             index,
3506                         );
3507                     }
3508                 }
3509                 IrqHandlerToken::IrqFd { index } => {
3510                     if let Err(e) = irq_chip.service_irq_event(index) {
3511                         error!("failed to signal irq {}: {}", index, e);
3512                     }
3513                 }
3514                 IrqHandlerToken::DelayedIrqFd => {
3515                     if let Err(e) = irq_chip.process_delayed_irq_events() {
3516                         warn!("can't deliver delayed irqs: {}", e);
3517                     }
3518                 }
3519             }
3520         }
3521 
3522         if notify_control_on_iteration_end {
3523             if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
3524                 token_count - 1,
3525             )) {
3526                 error!(
3527                     "failed to notify on iteration completion (snapshotting may fail): {}",
3528                     e
3529                 );
3530             }
3531         }
3532 
3533         remove_hungup_and_drained_tubes(
3534             &events,
3535             &wait_ctx,
3536             &mut irq_control_tubes,
3537             vm_irq_tubes_to_remove,
3538             |token: &IrqHandlerToken| {
3539                 if let IrqHandlerToken::VmIrq { index } = token {
3540                     return Some(*index);
3541                 }
3542                 None
3543             },
3544             |index: usize| IrqHandlerToken::VmIrq { index },
3545         )?;
3546         if events.iter().any(|e| {
3547             e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
3548         }) {
3549             error!("IRQ handler control hung up but did not request an exit.");
3550             break 'wait;
3551         }
3552     }
3553     Ok(())
3554 }
3555 
handle_irq_tube_request( sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>, irq_chip: &mut Box<dyn IrqChipArch + 'static>, vm_irq_tubes_to_remove: &mut Vec<usize>, wait_ctx: &WaitContext<IrqHandlerToken>, tube: &Tube, tube_index: usize, )3556 fn handle_irq_tube_request(
3557     sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
3558     irq_chip: &mut Box<dyn IrqChipArch + 'static>,
3559     vm_irq_tubes_to_remove: &mut Vec<usize>,
3560     wait_ctx: &WaitContext<IrqHandlerToken>,
3561     tube: &Tube,
3562     tube_index: usize,
3563 ) {
3564     match tube.recv::<VmIrqRequest>() {
3565         Ok(request) => {
3566             let response = {
3567                 request.execute(
3568                     |setup| match setup {
3569                         IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
3570                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
3571                             let source = IrqEventSource {
3572                                 device_id: device_id.try_into().expect("Invalid device_id"),
3573                                 queue_id,
3574                                 device_name,
3575                             };
3576                             if let Some(event_index) =
3577                                 irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
3578                             {
3579                                 if let Err(e) =
3580                                     wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
3581                                 {
3582                                     warn!("failed to add IrqFd to poll context: {}", e);
3583                                     return Err(e);
3584                                 }
3585                             }
3586                             Ok(())
3587                         }
3588                         IrqSetup::Route(route) => irq_chip.route_irq(route),
3589                         IrqSetup::UnRegister(irq, ev) => {
3590                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
3591                             irq_chip.unregister_edge_irq_event(irq, &irq_evt)
3592                         }
3593                     },
3594                     &mut sys_allocator_mutex.lock(),
3595                 )
3596             };
3597             if let Err(e) = tube.send(&response) {
3598                 error!("failed to send VmIrqResponse: {}", e);
3599             }
3600         }
3601         Err(e) => {
3602             if let TubeError::Disconnected = e {
3603                 vm_irq_tubes_to_remove.push(tube_index);
3604             } else {
3605                 error!("failed to recv VmIrqRequest: {}", e);
3606             }
3607         }
3608     }
3609 }
3610 
/// When control tubes hang up, we want to make sure that we've fully drained
/// the underlying socket before removing it. This function also handles
/// removing closed sockets in such a way that avoids phantom events.
///
/// `tube_indices_to_remove` is the set of indices that we already know should
/// be removed (e.g. from getting a disconnect error on read).
///
/// `get_tube_index` maps a wait-context token back to the index of the tube it
/// refers to (returning `None` for tokens that are not tube-related), and
/// `make_token_for_tube` builds a fresh token for a tube that has been moved
/// to a new index by `swap_remove` below.
fn remove_hungup_and_drained_tubes<T, U>(
    events: &SmallVec<[TriggeredEvent<T>; 16]>,
    wait_ctx: &WaitContext<T>,
    tubes: &mut Vec<U>,
    mut tube_indices_to_remove: Vec<usize>,
    get_tube_index: fn(token: &T) -> Option<usize>,
    make_token_for_tube: fn(usize) -> T,
) -> anyhow::Result<()>
where
    T: EventToken,
    U: ReadNotifier,
{
    // It's possible more data is readable and buffered while the socket is hungup,
    // so don't delete the tube from the poll context until we're sure all the
    // data is read.
    // Below case covers a condition where we have received a hungup event and the tube is not
    // readable.
    // In case of readable tube, once all data is read, any attempt to read more data on hungup
    // tube should fail. On such failure, we get Disconnected error and index gets added to
    // vm_control_indices_to_remove by the time we reach here.
    for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
        if let Some(index) = get_tube_index(&event.token) {
            tube_indices_to_remove.push(index);
        }
    }

    // Sort in reverse so the highest indexes are removed first. This removal algorithm
    // preserves correct indexes as each element is removed.
    tube_indices_to_remove.sort_unstable_by_key(|&k| Reverse(k));
    // `dedup` only removes consecutive duplicates, which is sufficient because the
    // vector was just sorted.
    tube_indices_to_remove.dedup();
    for index in tube_indices_to_remove {
        // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
        // this automatically when the FD inserted into the `wait_ctx` is closed after this
        // if-block, but this removal can be deferred unpredictably. In some instances where the
        // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
        // that has already been closed. Because the token associated with that spurious event
        // now belongs to a different socket, the control loop will start to interact with
        // sockets that might not be ready to use. This can cause incorrect hangup detection or
        // blocking on a socket that will never be ready. See also: crbug.com/1019986
        if let Some(socket) = tubes.get(index) {
            wait_ctx
                .delete(socket.get_read_notifier())
                .context("failed to remove descriptor from wait context")?;
        }

        // This line implicitly drops the socket at `index` when it gets returned by
        // `swap_remove`. After this line, the socket at `index` is not the one from
        // `tube_indices_to_remove`. Because of this socket's change in index, we need to
        // use `wait_ctx.modify` to change the associated index in its `Token::VmControl`.
        tubes.swap_remove(index);
        // If an element was swapped into `index` (i.e. `index` was not the last slot),
        // re-register it under a token carrying its new position.
        if let Some(tube) = tubes.get(index) {
            wait_ctx
                .modify(
                    tube.get_read_notifier(),
                    EventType::Read,
                    make_token_for_tube(index),
                )
                .context("failed to add descriptor to wait context")?;
        }
    }
    Ok(())
}
3679 
/// Start and jail a vhost-user device according to its configuration and a vhost listener string.
///
/// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
/// call outside of `start_devices`!
///
/// Returns the pid of the jailed device process, along with an optional opaque handle holding
/// listener resources that the parent process must keep alive and clean up itself.
fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
    jail_config: &Option<JailConfig>,
    params: T,
    vhost: &str,
    name: &str,
) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
    // Descriptors that must survive the fork into the jailed child process.
    let mut keep_rds = Vec::new();

    // Preserve logging and tracing descriptors for the child.
    base::syslog::push_descriptors(&mut keep_rds);
    cros_tracing::push_descriptors!(&mut keep_rds);

    let jail_type = VhostUserListener::get_virtio_transport_type(vhost);

    // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
    // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
    let jail = params
        .create_jail(jail_config, jail_type)
        .with_context(|| format!("failed to create jail for {}", name))?
        .ok_or(())
        .or_else(|_| Minijail::new())
        .with_context(|| format!("failed to create empty jail for {}", name))?;

    // Create the device in the parent process, so the child does not need any privileges necessary
    // to do it (only runtime capabilities are required).
    let device = params
        .create_vhost_user_device(&mut keep_rds)
        .context("failed to create vhost-user device")?;
    let mut listener = VhostUserListener::new(vhost, device.max_queue_num(), Some(&mut keep_rds))
        .context("failed to create the vhost listener")?;
    let parent_resources = listener.take_parent_process_resources();

    // Capture TZ now so it can be re-exported inside the child after the fork (see below).
    let tz = std::env::var("TZ").unwrap_or_default();

    // Executor must be created before jail in order to prevent the jailed process from creating
    // unrestricted io_urings.
    let ex = Executor::with_executor_kind(device.executor_kind().unwrap_or_default())
        .context("Failed to create an Executor")?;
    keep_rds.extend(ex.as_raw_descriptors());

    // Deduplicate the FDs since minijail expects them to be unique.
    keep_rds.sort_unstable();
    keep_rds.dedup();

    // Safe because we are keeping all the descriptors needed for the child to function.
    match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
        0 => {
            // In the child process.

            // Free memory for the resources managed by the parent, without running drop() on them.
            // The parent will do it as we exit.
            let _ = std::mem::ManuallyDrop::new(parent_resources);

            // Make sure the child process does not survive its parent.
            if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
                panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
            }

            // Set the name for the thread.
            const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
            let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
            let thread_name = CString::new(debug_label_trimmed).unwrap();
            // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
            // an error if we don't anyway).
            let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };

            // Preserve TZ for `chrono::Local` (b/257987535).
            std::env::set_var("TZ", tz);

            // Run the device loop and terminate the child process once it exits.
            let res = match listener.run_device(ex, device) {
                Ok(()) => 0,
                Err(e) => {
                    error!("error while running device {}: {:#}", name, e);
                    1
                }
            };
            // exit() rather than returning: the child must never unwind back into the
            // parent's call stack.
            unsafe { libc::exit(res) };
        }
        pid => {
            // In the parent process. We will drop the device and listener when exiting this method.
            // This is fine as ownership for both has been transferred to the child process and they
            // will keep living there. We just retain `parent_resources` for things we are supposed
            // to clean up ourselves.

            info!("process for device {} (PID {}) started", &name, pid);
            #[cfg(feature = "seccomp_trace")]
            debug!(
                    "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
                    pid,
                    &name,
                    read_jail_addr(&jail)
                );
            Ok((pid, parent_resources))
        }
    }
}
3782 
process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()>3783 fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
3784     let command = tube
3785         .recv::<VmRequest>()
3786         .context("failed to receive VmRequest")?;
3787     let resp = match command {
3788         VmRequest::DiskCommand {
3789             disk_index,
3790             ref command,
3791         } => match &disk_host_tubes.get(disk_index) {
3792             Some(tube) => handle_disk_command(command, tube),
3793             None => VmResponse::Err(base::Error::new(libc::ENODEV)),
3794         },
3795         request => {
3796             error!(
3797                 "Request {:?} currently not supported in vhost user backend",
3798                 request
3799             );
3800             VmResponse::Err(base::Error::new(libc::EPERM))
3801         }
3802     };
3803 
3804     tube.send(&resp).context("failed to send VmResponse")?;
3805     Ok(())
3806 }
3807 
start_vhost_user_control_server( control_server_socket: UnlinkUnixSeqpacketListener, disk_host_tubes: Vec<Tube>, )3808 fn start_vhost_user_control_server(
3809     control_server_socket: UnlinkUnixSeqpacketListener,
3810     disk_host_tubes: Vec<Tube>,
3811 ) {
3812     info!("Start vhost-user control server");
3813     loop {
3814         match control_server_socket.accept() {
3815             Ok(socket) => {
3816                 let tube = Tube::new_from_unix_seqpacket(socket);
3817                 if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
3818                     error!("failed to process control request: {:#}", e);
3819                 }
3820             }
3821             Err(e) => {
3822                 error!("failed to establish connection: {}", e);
3823             }
3824         }
3825     }
3826 }
3827 
/// Starts every vhost-user device requested in `opts` (serial, block, vsock) in its own jailed
/// child process, optionally serves a control socket for disk commands, and blocks until all
/// device processes have exited.
pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
    if let Some(async_executor) = opts.async_executor {
        Executor::set_default_executor_kind(async_executor)
            .context("Failed to set the default async executor")?;
    }

    // Per-process bookkeeping, keyed by PID in `devices_jails` below.
    struct DeviceJailInfo {
        // Unique name for the device, in the form `foomatic-0`.
        name: String,
        // Opaque parent-side resources kept alive for the lifetime of the child process.
        _drop_resources: Option<Box<dyn std::any::Any>>,
    }

    // Spawns one jailed device process and records it in `devices_jails` by PID.
    fn add_device<T: VirtioDeviceBuilder>(
        i: usize,
        device_params: T,
        vhost: &str,
        jail_config: &Option<JailConfig>,
        devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
    ) -> anyhow::Result<()> {
        let name = format!("{}-{}", T::NAME, i);

        let (pid, _drop_resources) =
            jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;

        devices_jails.insert(
            pid,
            DeviceJailInfo {
                name,
                _drop_resources,
            },
        );

        Ok(())
    }

    let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();

    // `None` jail config means the devices run unsandboxed.
    let jail = if opts.disable_sandbox {
        None
    } else {
        Some(opts.jail)
    };

    // Create control server socket
    let control_server_socket = opts.control_socket.map(|path| {
        UnlinkUnixSeqpacketListener(
            UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
        )
    });

    // Create serial devices.
    for (i, params) in opts.serial.iter().enumerate() {
        let serial_config = &params.device;
        add_device(i, serial_config, &params.vhost, &jail, &mut devices_jails)?;
    }

    let mut disk_host_tubes = Vec::new();
    let control_socket_exists = control_server_socket.is_some();
    // Create block devices.
    for (i, params) in opts.block.iter().enumerate() {
        // Disk control tubes are only wired up when a control server exists to drive them.
        let tube = if control_socket_exists {
            let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
            disk_host_tubes.push(host_tube);
            Some(device_tube)
        } else {
            None
        };
        let disk_config = DiskConfig::new(&params.device, tube);
        add_device(i, disk_config, &params.vhost, &jail, &mut devices_jails)?;
    }

    // Create vsock devices.
    for (i, params) in opts.vsock.iter().enumerate() {
        add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
    }

    let ex = Executor::new()?;
    if let Some(control_server_socket) = control_server_socket {
        // Start the control server in the parent process.
        ex.spawn_blocking(move || {
            start_vhost_user_control_server(control_server_socket, disk_host_tubes)
        })
        .detach();
    }

    // Now wait for all device processes to return.
    while !devices_jails.is_empty() {
        match base::platform::wait_for_pid(-1, 0) {
            Err(e) => panic!("error waiting for child process to complete: {:#}", e),
            Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
                Some((_, info)) => {
                    if let Some(status) = wait_status.code() {
                        info!(
                            "process for device {} (PID {}) exited with code {}",
                            &info.name, pid, status
                        );
                    } else if let Some(signal) = wait_status.signal() {
                        warn!(
                            "process for device {} (PID {}) has been killed by signal {:?}",
                            &info.name, pid, signal,
                        );
                    }
                }
                None => error!("pid {} is not one of our device processes", pid),
            },
            // `wait_for_pid` will necessarily return a PID because we asked to it wait for one to
            // complete.
            Ok((None, _)) => unreachable!(),
        }
    }

    info!("all device processes have exited");

    Ok(())
}
3943 
/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
/// making crash reports incomprehensible.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    // Only the product type is specialized for the emulator process; every other
    // attribute is left unset.
    let attributes = crash_report::CrashReportAttributes {
        product_type: "emulator".to_owned(),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attributes)
}
3956 
3957 #[cfg(test)]
3958 mod tests {
3959     use std::path::PathBuf;
3960 
3961     use super::*;
3962 
3963     // Create a file-backed mapping parameters struct with the given `address` and `size` and other
3964     // parameters set to default values.
test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters3965     fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
3966         FileBackedMappingParameters {
3967             address,
3968             size,
3969             path: PathBuf::new(),
3970             offset: 0,
3971             writable: false,
3972             sync: false,
3973             align: false,
3974         }
3975     }
3976 
    // Exercises `punch_holes_in_guest_mem_layout_for_mappings()` against every interesting
    // overlap position of a file-backed mapping relative to a two-region guest memory layout
    // (regions are (start address, size, options) tuples; options are left at their defaults):
    // no mappings, no overlap, overlap at region start/end/middle for both the low and high
    // regions, and a single mapping spanning both regions.
    #[test]
    fn guest_mem_file_backed_mappings_overlap() {
        // Base case: no file mappings; output layout should be identical.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping that does not overlap guest memory.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000, 0x1000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the start of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0, 0x2000)]
            ),
            vec![
                (
                    GuestAddress(0x2000),
                    0xD000_0000 - 0x2000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the end of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping fully contained within the middle of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0x1000, Default::default()),
                (
                    GuestAddress(0x3000),
                    0xD000_0000 - 0x3000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the start of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping at the end of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping fully contained within the middle of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
                (
                    GuestAddress(0x1_0000_3000),
                    0x8_0000 - 0x3000,
                    Default::default()
                ),
            ]
        );

        // File mapping overlapping two guest memory regions.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
            ),
            vec![
                (GuestAddress(0), 0xA000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );
    }
4140 }
4141