1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #[cfg(target_os = "android")]
6 mod android;
7 pub mod cmdline;
8 pub mod config;
9 mod device_helpers;
10 #[cfg(feature = "gpu")]
11 pub(crate) mod gpu;
12 #[cfg(feature = "pci-hotplug")]
13 pub(crate) mod jail_warden;
14 #[cfg(feature = "pci-hotplug")]
15 pub(crate) mod pci_hotplug_helpers;
16 #[cfg(feature = "pci-hotplug")]
17 pub(crate) mod pci_hotplug_manager;
18 mod vcpu;
19
20 use std::cmp::max;
21 use std::collections::BTreeMap;
22 use std::collections::BTreeSet;
23 #[cfg(feature = "registered_events")]
24 use std::collections::HashMap;
25 #[cfg(feature = "registered_events")]
26 use std::collections::HashSet;
27 use std::convert::TryInto;
28 use std::ffi::CString;
29 use std::fs::File;
30 use std::fs::OpenOptions;
31 #[cfg(feature = "registered_events")]
32 use std::hash::Hash;
33 use std::io::prelude::*;
34 use std::io::stdin;
35 use std::iter;
36 use std::mem;
37 #[cfg(target_arch = "x86_64")]
38 use std::ops::RangeInclusive;
39 use std::os::unix::prelude::OpenOptionsExt;
40 use std::os::unix::process::ExitStatusExt;
41 use std::path::Path;
42 use std::process;
43 #[cfg(feature = "registered_events")]
44 use std::rc::Rc;
45 use std::sync::mpsc;
46 use std::sync::Arc;
47 use std::sync::Barrier;
48 use std::thread::JoinHandle;
49
50 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
51 use aarch64::AArch64 as Arch;
52 use acpi_tables::sdt::SDT;
53 use anyhow::anyhow;
54 use anyhow::bail;
55 use anyhow::Context;
56 use anyhow::Result;
57 use arch::DtbOverlay;
58 use arch::IrqChipArch;
59 use arch::LinuxArch;
60 use arch::RunnableLinuxVm;
61 use arch::VcpuAffinity;
62 use arch::VcpuArch;
63 use arch::VirtioDeviceStub;
64 use arch::VmArch;
65 use arch::VmComponents;
66 use arch::VmImage;
67 use argh::FromArgs;
68 use base::ReadNotifier;
69 #[cfg(feature = "balloon")]
70 use base::UnixSeqpacket;
71 use base::UnixSeqpacketListener;
72 use base::UnlinkUnixSeqpacketListener;
73 use base::*;
74 use cros_async::Executor;
75 use device_helpers::*;
76 use devices::create_devices_worker_thread;
77 use devices::serial_device::SerialHardware;
78 #[cfg(feature = "pvclock")]
79 use devices::tsc::get_tsc_sync_mitigations;
80 use devices::vfio::VfioCommonSetup;
81 use devices::vfio::VfioCommonTrait;
82 #[cfg(feature = "gpu")]
83 use devices::virtio;
84 #[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
85 use devices::virtio::device_constants::video::VideoDeviceType;
86 #[cfg(feature = "gpu")]
87 use devices::virtio::gpu::EventDevice;
88 #[cfg(target_arch = "x86_64")]
89 use devices::virtio::memory_mapper::MemoryMapper;
90 use devices::virtio::memory_mapper::MemoryMapperTrait;
91 use devices::virtio::vhost::user::VhostUserListener;
92 use devices::virtio::vhost::user::VhostUserListenerTrait;
93 #[cfg(feature = "balloon")]
94 use devices::virtio::BalloonFeatures;
95 #[cfg(feature = "balloon")]
96 use devices::virtio::BalloonMode;
97 #[cfg(feature = "pci-hotplug")]
98 use devices::virtio::NetParameters;
99 #[cfg(feature = "pci-hotplug")]
100 use devices::virtio::NetParametersMode;
101 use devices::virtio::VirtioDevice;
102 use devices::virtio::VirtioDeviceType;
103 use devices::virtio::VirtioTransportType;
104 use devices::Bus;
105 use devices::BusDeviceObj;
106 use devices::BusType;
107 use devices::CoIommuDev;
108 #[cfg(feature = "usb")]
109 use devices::DeviceProvider;
110 #[cfg(target_arch = "x86_64")]
111 use devices::HotPlugBus;
112 #[cfg(target_arch = "x86_64")]
113 use devices::HotPlugKey;
114 use devices::IommuDevType;
115 use devices::IrqEventIndex;
116 use devices::IrqEventSource;
117 #[cfg(feature = "pci-hotplug")]
118 use devices::NetResourceCarrier;
119 #[cfg(target_arch = "x86_64")]
120 use devices::PciAddress;
121 #[cfg(target_arch = "x86_64")]
122 use devices::PciBridge;
123 use devices::PciDevice;
124 #[cfg(target_arch = "x86_64")]
125 use devices::PciMmioMapper;
126 #[cfg(target_arch = "x86_64")]
127 use devices::PciRoot;
128 #[cfg(target_arch = "x86_64")]
129 use devices::PciRootCommand;
130 #[cfg(target_arch = "x86_64")]
131 use devices::PcieDownstreamPort;
132 #[cfg(target_arch = "x86_64")]
133 use devices::PcieHostPort;
134 #[cfg(target_arch = "x86_64")]
135 use devices::PcieRootPort;
136 #[cfg(target_arch = "x86_64")]
137 use devices::PcieUpstreamPort;
138 use devices::PvPanicCode;
139 use devices::PvPanicPciDevice;
140 #[cfg(feature = "pci-hotplug")]
141 use devices::ResourceCarrier;
142 use devices::StubPciDevice;
143 use devices::VirtioMmioDevice;
144 use devices::VirtioPciDevice;
145 #[cfg(feature = "usb")]
146 use devices::XhciController;
147 #[cfg(feature = "gpu")]
148 use gpu::*;
149 #[cfg(target_arch = "riscv64")]
150 use hypervisor::CpuConfigRiscv64;
151 #[cfg(target_arch = "x86_64")]
152 use hypervisor::CpuConfigX86_64;
153 use hypervisor::Hypervisor;
154 use hypervisor::HypervisorCap;
155 use hypervisor::MemCacheType;
156 use hypervisor::ProtectionType;
157 use hypervisor::Vm;
158 use hypervisor::VmCap;
159 use jail::*;
160 #[cfg(feature = "pci-hotplug")]
161 use jail_warden::JailWarden;
162 #[cfg(feature = "pci-hotplug")]
163 use jail_warden::JailWardenImpl;
164 #[cfg(feature = "pci-hotplug")]
165 use jail_warden::PermissiveJailWarden;
166 use libc;
167 use metrics::MetricsController;
168 use minijail::Minijail;
169 #[cfg(feature = "pci-hotplug")]
170 use pci_hotplug_manager::PciHotPlugManager;
171 use resources::AddressRange;
172 use resources::Alloc;
173 use resources::SystemAllocator;
174 #[cfg(target_arch = "riscv64")]
175 use riscv64::Riscv64 as Arch;
176 use rutabaga_gfx::RutabagaGralloc;
177 use rutabaga_gfx::RutabagaGrallocBackendFlags;
178 use smallvec::SmallVec;
179 #[cfg(feature = "swap")]
180 use swap::SwapController;
181 use sync::Condvar;
182 use sync::Mutex;
183 use vm_control::api::VmMemoryClient;
184 use vm_control::*;
185 use vm_memory::GuestAddress;
186 use vm_memory::GuestMemory;
187 use vm_memory::MemoryPolicy;
188 use vm_memory::MemoryRegionOptions;
189 #[cfg(target_arch = "x86_64")]
190 use x86_64::X8664arch as Arch;
191
192 use crate::crosvm::config::Config;
193 use crate::crosvm::config::Executable;
194 use crate::crosvm::config::FileBackedMappingParameters;
195 use crate::crosvm::config::HypervisorKind;
196 use crate::crosvm::config::InputDeviceOption;
197 use crate::crosvm::config::IrqChipKind;
198 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
199 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
200 #[cfg(feature = "gdb")]
201 use crate::crosvm::gdb::gdb_thread;
202 #[cfg(feature = "gdb")]
203 use crate::crosvm::gdb::GdbStub;
204 #[cfg(target_arch = "x86_64")]
205 use crate::crosvm::ratelimit::Ratelimit;
206 use crate::crosvm::sys::cmdline::DevicesCommand;
207 use crate::crosvm::sys::config::SharedDir;
208 use crate::crosvm::sys::config::SharedDirKind;
209
210 const KVM_PATH: &str = "/dev/kvm";
211 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
212 #[cfg(feature = "geniezone")]
213 const GENIEZONE_PATH: &str = "/dev/gzvm";
214 #[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
215 static GUNYAH_PATH: &str = "/dev/gunyah";
216
217 fn create_virtio_devices(
218 cfg: &Config,
219 vm: &mut impl Vm,
220 resources: &mut SystemAllocator,
221 #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
222 #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
223 #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
224 #[cfg(feature = "balloon")] init_balloon_size: u64,
225 #[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>,
226 disk_device_tubes: &mut Vec<Tube>,
227 pmem_device_tubes: &mut Vec<Tube>,
228 fs_device_tubes: &mut Vec<Tube>,
229 #[cfg(feature = "gpu")] gpu_control_tube: Tube,
230 #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
231 #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
232 #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
233 #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
234 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
235 let mut devs = Vec::new();
236
237 #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
238 let mut resource_bridges = Vec::<Tube>::new();
239
240 if !cfg.wayland_socket_paths.is_empty() {
241 #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
242 let mut wl_resource_bridge = None::<Tube>;
243
244 #[cfg(feature = "gpu")]
245 {
246 if cfg.gpu_parameters.is_some() {
247 let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
248 resource_bridges.push(gpu_socket);
249 wl_resource_bridge = Some(wl_socket);
250 }
251 }
252
253 devs.push(create_wayland_device(
254 cfg.protection_type,
255 &cfg.jail_config,
256 &cfg.wayland_socket_paths,
257 wl_resource_bridge,
258 )?);
259 }
260
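    // Each video decoder/encoder created below shares buffers with the GPU device over a
    // "resource bridge": the video side keeps one end of a Tube pair, while the GPU end is
    // collected in `resource_bridges` and handed to the GPU device.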
261 #[cfg(feature = "video-decoder")]
262 let video_dec_cfg = cfg
263 .video_dec
264 .iter()
265 .map(|config| {
266 let (video_tube, gpu_tube) =
267 Tube::pair().expect("failed to create tube for video decoder");
268 resource_bridges.push(gpu_tube);
269 (video_tube, config.backend)
270 })
271 .collect::<Vec<_>>();
272
273 #[cfg(feature = "video-encoder")]
274 let video_enc_cfg = cfg
275 .video_enc
276 .iter()
277 .map(|config| {
278 let (video_tube, gpu_tube) =
279 Tube::pair().expect("failed to create tube for video encoder");
280 resource_bridges.push(gpu_tube);
281 (video_tube, config.backend)
282 })
283 .collect::<Vec<_>>();
284
285 #[cfg(feature = "gpu")]
286 {
287 if let Some(gpu_parameters) = &cfg.gpu_parameters {
288 let mut event_devices = Vec::new();
289 if cfg.display_window_mouse {
290 let display_param = if gpu_parameters.display_params.is_empty() {
291 Default::default()
292 } else {
293 gpu_parameters.display_params[0].clone()
294 };
295 let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();
296
297 let (event_device_socket, virtio_dev_socket) =
298 StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
299 .context("failed to create socket")?;
300 let mut multi_touch_width = gpu_display_w;
301 let mut multi_touch_height = gpu_display_h;
302 let mut multi_touch_name = None;
303 for input in &cfg.virtio_input {
304 if let InputDeviceOption::MultiTouch {
305 width,
306 height,
307 name,
308 ..
309 } = input
310 {
311 if let Some(width) = width {
312 multi_touch_width = *width;
313 }
314 if let Some(height) = height {
315 multi_touch_height = *height;
316 }
317 if let Some(name) = name {
318 multi_touch_name = Some(name.as_str());
319 }
320 break;
321 }
322 }
323 let dev = virtio::input::new_multi_touch(
324 // u32::MAX is the least likely to collide with the indices generated above for
325 // the multi_touch options, which begin at 0.
326 u32::MAX,
327 virtio_dev_socket,
328 multi_touch_width,
329 multi_touch_height,
330 multi_touch_name,
331 virtio::base_features(cfg.protection_type),
332 )
333 .context("failed to set up mouse device")?;
334 devs.push(VirtioDeviceStub {
335 dev: Box::new(dev),
336 jail: simple_jail(&cfg.jail_config, "input_device")?,
337 });
338 event_devices.push(EventDevice::touchscreen(event_device_socket));
339 }
340 if cfg.display_window_keyboard {
341 let (event_device_socket, virtio_dev_socket) =
342 StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
343 .context("failed to create socket")?;
344 let dev = virtio::input::new_keyboard(
345 // u32::MAX is the least likely to collide with the indices generated above for
346 // the multi_touch options, which begin at 0.
347 u32::MAX,
348 virtio_dev_socket,
349 virtio::base_features(cfg.protection_type),
350 )
351 .context("failed to set up keyboard device")?;
352 devs.push(VirtioDeviceStub {
353 dev: Box::new(dev),
354 jail: simple_jail(&cfg.jail_config, "input_device")?,
355 });
356 event_devices.push(EventDevice::keyboard(event_device_socket));
357 }
358
359 devs.push(create_gpu_device(
360 cfg,
361 vm_evt_wrtube,
362 gpu_control_tube,
363 resource_bridges,
364 render_server_fd,
365 has_vfio_gfx_device,
366 event_devices,
367 )?);
368 }
369 }
370
371 for (_, param) in cfg.serial_parameters.iter().filter(|(_k, v)| {
372 v.hardware == SerialHardware::VirtioConsole
373 || v.hardware == SerialHardware::LegacyVirtioConsole
374 }) {
375 let dev = param.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
376 devs.push(dev);
377 }
378
379 for disk in &cfg.disks {
380 let disk_config = DiskConfig::new(disk, Some(disk_device_tubes.remove(0)));
381 devs.push(
382 disk_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
383 );
384 }
385
386 if !cfg.scsis.is_empty() {
387 let scsi_config = ScsiConfig(&cfg.scsis);
388 devs.push(
389 scsi_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
390 );
391 }
392
393 for (index, pmem_disk) in cfg.pmem_devices.iter().enumerate() {
394 let pmem_device_tube = pmem_device_tubes.remove(0);
395 devs.push(create_pmem_device(
396 cfg.protection_type,
397 &cfg.jail_config,
398 vm,
399 resources,
400 pmem_disk,
401 index,
402 pmem_device_tube,
403 )?);
404 }
405
406 if cfg.rng {
407 devs.push(create_rng_device(cfg.protection_type, &cfg.jail_config)?);
408 }
409
410 #[cfg(feature = "pvclock")]
411 if let Some(suspend_tube) = pvclock_device_tube {
412 let tsc_state = devices::tsc::tsc_state()?;
413 let tsc_sync_mitigations =
414 get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
415 if tsc_state.core_grouping.size() > 1 {
416 // Host TSCs are not in sync. Log what mitigations are applied.
417 warn!(
418 "Host TSCs are not in sync, applying the following mitigations: {:?}",
419 tsc_sync_mitigations
420 );
421 }
422 devs.push(create_pvclock_device(
423 cfg.protection_type,
424 &cfg.jail_config,
425 tsc_state.frequency,
426 suspend_tube,
427 )?);
428 info!("virtio-pvclock is enabled for this vm");
429 }
430
431 #[cfg(feature = "vtpm")]
432 {
433 if cfg.vtpm_proxy {
434 devs.push(create_vtpm_proxy_device(
435 cfg.protection_type,
436 &cfg.jail_config,
437 )?);
438 }
439 }
440
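    // Each virtio-input device type keeps its own counter so that every instance of a given
    // type receives a unique index.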
441 let mut keyboard_idx = 0;
442 let mut mouse_idx = 0;
443 let mut rotary_idx = 0;
444 let mut switches_idx = 0;
445 let mut multi_touch_idx = 0;
446 let mut single_touch_idx = 0;
447 let mut trackpad_idx = 0;
448 for input in &cfg.virtio_input {
449 let input_dev = match input {
450 InputDeviceOption::Evdev { path } => {
451 create_vinput_device(cfg.protection_type, &cfg.jail_config, path.as_path())?
452 }
453 InputDeviceOption::Keyboard { path } => {
454 let dev = create_keyboard_device(
455 cfg.protection_type,
456 &cfg.jail_config,
457 path.as_path(),
458 keyboard_idx,
459 )?;
460 keyboard_idx += 1;
461 dev
462 }
463 InputDeviceOption::Mouse { path } => {
464 let dev = create_mouse_device(
465 cfg.protection_type,
466 &cfg.jail_config,
467 path.as_path(),
468 mouse_idx,
469 )?;
470 mouse_idx += 1;
471 dev
472 }
473 InputDeviceOption::MultiTouch {
474 path,
475 width,
476 height,
477 name,
478 } => {
479 let mut width = *width;
480 let mut height = *height;
481 if multi_touch_idx == 0 {
482 if width.is_none() {
483 width = cfg.display_input_width;
484 }
485 if height.is_none() {
486 height = cfg.display_input_height;
487 }
488 }
489 let dev = create_multi_touch_device(
490 cfg.protection_type,
491 &cfg.jail_config,
492 path.as_path(),
493 width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
494 height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
495 name.as_deref(),
496 multi_touch_idx,
497 )?;
498 multi_touch_idx += 1;
499 dev
500 }
501 InputDeviceOption::Rotary { path } => {
502 let dev = create_rotary_device(
503 cfg.protection_type,
504 &cfg.jail_config,
505 path.as_path(),
506 rotary_idx,
507 )?;
508 rotary_idx += 1;
509 dev
510 }
511 InputDeviceOption::SingleTouch {
512 path,
513 width,
514 height,
515 name,
516 } => {
517 let mut width = *width;
518 let mut height = *height;
519 if single_touch_idx == 0 {
520 if width.is_none() {
521 width = cfg.display_input_width;
522 }
523 if height.is_none() {
524 height = cfg.display_input_height;
525 }
526 }
527 let dev = create_single_touch_device(
528 cfg.protection_type,
529 &cfg.jail_config,
530 path.as_path(),
531 width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
532 height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
533 name.as_deref(),
534 single_touch_idx,
535 )?;
536 single_touch_idx += 1;
537 dev
538 }
539 InputDeviceOption::Switches { path } => {
540 let dev = create_switches_device(
541 cfg.protection_type,
542 &cfg.jail_config,
543 path.as_path(),
544 switches_idx,
545 )?;
546 switches_idx += 1;
547 dev
548 }
549 InputDeviceOption::Trackpad {
550 path,
551 width,
552 height,
553 name,
554 } => {
555 let dev = create_trackpad_device(
556 cfg.protection_type,
557 &cfg.jail_config,
558 path.as_path(),
559 width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
560 height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
561 name.as_deref(),
562 trackpad_idx,
563 )?;
564 trackpad_idx += 1;
565 dev
566 }
567 };
568 devs.push(input_dev);
569 }
570
571 #[cfg(feature = "balloon")]
572 if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
573 (balloon_device_tube, dynamic_mapping_device_tube)
574 {
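        // Build the balloon feature bitmask: each enabled config option sets its
        // corresponding `BalloonFeatures` bit.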
575 let balloon_features = (cfg.balloon_page_reporting as u64)
576 << BalloonFeatures::PageReporting as u64
577 | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;
578 devs.push(create_balloon_device(
579 cfg.protection_type,
580 &cfg.jail_config,
581 if cfg.strict_balloon {
582 BalloonMode::Strict
583 } else {
584 BalloonMode::Relaxed
585 },
586 balloon_device_tube,
587 balloon_inflate_tube,
588 init_balloon_size,
589 dynamic_mapping_device_tube,
590 balloon_features,
591 #[cfg(feature = "registered_events")]
592 Some(
593 registered_evt_q
594 .try_clone()
595 .context("failed to clone registered_evt_q tube")?,
596 ),
597 cfg.balloon_ws_num_bins,
598 )?);
599 }
600
601 #[cfg(feature = "net")]
602 for opt in &cfg.net {
603 let dev = opt.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
604 devs.push(dev);
605 }
606
607 #[cfg(feature = "audio")]
608 {
609 for virtio_snd in &cfg.virtio_snds {
610 devs.push(create_virtio_snd_device(
611 cfg.protection_type,
612 &cfg.jail_config,
613 virtio_snd.clone(),
614 )?);
615 }
616 }
617
618 #[cfg(feature = "video-decoder")]
619 {
620 for (tube, backend) in video_dec_cfg {
621 register_video_device(
622 backend,
623 &mut devs,
624 tube,
625 cfg.protection_type,
626 &cfg.jail_config,
627 VideoDeviceType::Decoder,
628 )?;
629 }
630 }
631
632 #[cfg(feature = "video-encoder")]
633 {
634 for (tube, backend) in video_enc_cfg {
635 register_video_device(
636 backend,
637 &mut devs,
638 tube,
639 cfg.protection_type,
640 &cfg.jail_config,
641 VideoDeviceType::Encoder,
642 )?;
643 }
644 }
645
646 if let Some(vsock_config) = &cfg.vsock {
647 devs.push(
648 vsock_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
649 );
650 }
651
652 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
653 {
654 if cfg.vhost_scmi {
655 devs.push(create_vhost_scmi_device(
656 cfg.protection_type,
657 &cfg.jail_config,
658 cfg.vhost_scmi_device.clone(),
659 )?);
660 }
661 }
662 for vhost_user_fs in &cfg.vhost_user_fs {
663 devs.push(create_vhost_user_fs_device(
664 cfg.protection_type,
665 vhost_user_fs,
666 )?);
667 }
668
669 for shared_dir in &cfg.shared_dirs {
670 let SharedDir {
671 src,
672 tag,
673 kind,
674 ugid,
675 uid_map,
676 gid_map,
677 fs_cfg,
678 p9_cfg,
679 } = shared_dir;
680
681 let dev = match kind {
682 SharedDirKind::FS => {
683 let device_tube = fs_device_tubes.remove(0);
684 create_fs_device(
685 cfg.protection_type,
686 &cfg.jail_config,
687 *ugid,
688 uid_map,
689 gid_map,
690 src,
691 tag,
692 fs_cfg.clone(),
693 device_tube,
694 )?
695 }
696 SharedDirKind::P9 => create_9p_device(
697 cfg.protection_type,
698 &cfg.jail_config,
699 *ugid,
700 uid_map,
701 gid_map,
702 src,
703 tag,
704 p9_cfg.clone(),
705 )?,
706 };
707 devs.push(dev);
708 }
709
710 #[cfg(feature = "audio")]
711 if let Some(path) = &cfg.sound {
712 devs.push(create_sound_device(
713 path,
714 cfg.protection_type,
715 &cfg.jail_config,
716 )?);
717 }
718
719 for opt in &cfg.vhost_user {
720 devs.push(create_vhost_user_frontend(cfg.protection_type, opt)?);
721 }
722
723 Ok(devs)
724 }
725
726 fn create_devices(
727 cfg: &Config,
728 vm: &mut impl Vm,
729 resources: &mut SystemAllocator,
730 vm_evt_wrtube: &SendTube,
731 iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
732 irq_control_tubes: &mut Vec<Tube>,
733 vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
734 control_tubes: &mut Vec<TaggedControlTube>,
735 #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
736 #[cfg(feature = "balloon")] init_balloon_size: u64,
737 #[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>,
738 disk_device_tubes: &mut Vec<Tube>,
739 pmem_device_tubes: &mut Vec<Tube>,
740 fs_device_tubes: &mut Vec<Tube>,
741 #[cfg(feature = "usb")] usb_provider: DeviceProvider,
742 #[cfg(feature = "gpu")] gpu_control_tube: Tube,
743 #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
744 iova_max_addr: &mut Option<u64>,
745 #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
746 #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
747 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
748 let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
749 #[cfg(feature = "balloon")]
750 let mut balloon_inflate_tube: Option<Tube> = None;
751 #[cfg(feature = "gpu")]
752 let mut has_vfio_gfx_device = false;
753 if !cfg.vfio.is_empty() {
754 let mut coiommu_attached_endpoints = Vec::new();
755
756 for vfio_dev in &cfg.vfio {
757 let (dev, jail, viommu_mapper) = create_vfio_device(
758 &cfg.jail_config,
759 vm,
760 resources,
761 irq_control_tubes,
762 vm_memory_control_tubes,
763 control_tubes,
764 &vfio_dev.path,
765 false,
766 None,
767 vfio_dev.guest_address,
768 Some(&mut coiommu_attached_endpoints),
769 vfio_dev.iommu,
770 vfio_dev.dt_symbol.clone(),
771 )?;
772 match dev {
773 VfioDeviceVariant::Pci(vfio_pci_device) => {
774 *iova_max_addr = Some(max(
775 vfio_pci_device.get_max_iova(),
776 iova_max_addr.unwrap_or(0),
777 ));
778
779 #[cfg(feature = "gpu")]
780 if vfio_pci_device.is_gfx() {
781 has_vfio_gfx_device = true;
782 }
783
784 if let Some(viommu_mapper) = viommu_mapper {
785 iommu_attached_endpoints.insert(
786 vfio_pci_device
787 .pci_address()
788 .context("not initialized")?
789 .to_u32(),
790 Arc::new(Mutex::new(Box::new(viommu_mapper))),
791 );
792 }
793
794 devices.push((Box::new(vfio_pci_device), jail));
795 }
796 VfioDeviceVariant::Platform(vfio_plat_dev) => {
797 devices.push((Box::new(vfio_plat_dev), jail));
798 }
799 }
800 }
801
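        // VFIO and CoIOMMU pin guest memory, which counts against RLIMIT_MEMLOCK, so raise
        // the soft limit by the guest memory size to keep the pinning ioctls from failing.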
802 if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
803 let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
804 // SAFETY: trivially safe
805 let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
806 if res == 0 {
807 // SAFETY: safe because getrlimit64 has returned success.
808 let limit = unsafe { buf.assume_init() };
809 let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
810 let rlim_max = max(limit.rlim_max, rlim_new);
811 if limit.rlim_cur < rlim_new {
812 let limit_arg = libc::rlimit64 {
813 rlim_cur: rlim_new,
814 rlim_max,
815 };
816 // SAFETY: trivially safe
817 let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
818 if res != 0 {
819 bail!("Set rlimit failed");
820 }
821 }
822 } else {
823 bail!("Get rlimit failed");
824 }
825 }
826 #[cfg(feature = "balloon")]
827 let coiommu_tube: Option<Tube>;
828 #[cfg(not(feature = "balloon"))]
829 let coiommu_tube: Option<Tube> = None;
830 if !coiommu_attached_endpoints.is_empty() {
831 let vfio_container =
832 VfioCommonSetup::vfio_get_container(IommuDevType::CoIommu, None as Option<&Path>)
833 .context("failed to get vfio container")?;
834 let (coiommu_host_tube, coiommu_device_tube) =
835 Tube::pair().context("failed to create coiommu tube")?;
836 vm_memory_control_tubes.push(VmMemoryTube {
837 tube: coiommu_host_tube,
838 expose_with_viommu: false,
839 });
840 let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
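            // When the balloon is enabled, CoIOMMU and the balloon device share a Tube pair
            // so that page unpinning can be coordinated with balloon inflation.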
841 #[cfg(feature = "balloon")]
842 match Tube::pair() {
843 Ok((x, y)) => {
844 coiommu_tube = Some(x);
845 balloon_inflate_tube = Some(y);
846 }
847 Err(x) => return Err(x).context("failed to create coiommu tube"),
848 }
849 let dev = CoIommuDev::new(
850 vm.get_memory().clone(),
851 vfio_container,
852 VmMemoryClient::new(coiommu_device_tube),
853 coiommu_tube,
854 coiommu_attached_endpoints,
855 vcpu_count,
856 cfg.coiommu_param.unwrap_or_default(),
857 )
858 .context("failed to create coiommu device")?;
859
860 devices.push((
861 Box::new(dev),
862 simple_jail(&cfg.jail_config, "coiommu_device")?,
863 ));
864 }
865 }
866
867 let stubs = create_virtio_devices(
868 cfg,
869 vm,
870 resources,
871 vm_evt_wrtube,
872 #[cfg(feature = "balloon")]
873 balloon_device_tube,
874 #[cfg(feature = "balloon")]
875 balloon_inflate_tube,
876 #[cfg(feature = "balloon")]
877 init_balloon_size,
878 #[cfg(feature = "balloon")]
879 dynamic_mapping_device_tube,
880 disk_device_tubes,
881 pmem_device_tubes,
882 fs_device_tubes,
883 #[cfg(feature = "gpu")]
884 gpu_control_tube,
885 #[cfg(feature = "gpu")]
886 render_server_fd,
887 #[cfg(feature = "gpu")]
888 has_vfio_gfx_device,
889 #[cfg(feature = "registered_events")]
890 registered_evt_q,
891 #[cfg(feature = "pvclock")]
892 pvclock_device_tube,
893 )?;
894
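    // Wrap each virtio device stub in its transport. PCI devices get MSI, shared-memory,
    // ioevent, and control tubes wired back to the main process; MMIO devices are wrapped
    // directly without the extra tubes.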
895 for stub in stubs {
896 match stub.dev.transport_type() {
897 VirtioTransportType::Pci => {
898 let (msi_host_tube, msi_device_tube) =
899 Tube::pair().context("failed to create tube")?;
900 irq_control_tubes.push(msi_host_tube);
901
902 let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
903 let (host_tube, device_tube) =
904 Tube::pair().context("failed to create shared memory tube")?;
905 vm_memory_control_tubes.push(VmMemoryTube {
906 tube: host_tube,
907 expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
908 });
909 Some(device_tube)
910 } else {
911 None
912 };
913
914 let (ioevent_host_tube, ioevent_device_tube) =
915 Tube::pair().context("failed to create ioevent tube")?;
916 vm_memory_control_tubes.push(VmMemoryTube {
917 tube: ioevent_host_tube,
918 expose_with_viommu: false,
919 });
920
921 let (host_tube, device_tube) =
922 Tube::pair().context("failed to create device control tube")?;
923 control_tubes.push(TaggedControlTube::Vm(host_tube));
924
925 let dev = VirtioPciDevice::new(
926 vm.get_memory().clone(),
927 stub.dev,
928 msi_device_tube,
929 cfg.disable_virtio_intx,
930 shared_memory_tube.map(VmMemoryClient::new),
931 VmMemoryClient::new(ioevent_device_tube),
932 device_tube,
933 )
934 .context("failed to create virtio pci dev")?;
935
936 devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
937 }
938 VirtioTransportType::Mmio => {
939 let dev = VirtioMmioDevice::new(vm.get_memory().clone(), stub.dev, false)
940 .context("failed to create virtio mmio dev")?;
941 devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
942 }
943 }
944 }
945
946 #[cfg(feature = "usb")]
947 if cfg.usb {
948 // Create xhci controller.
949 let usb_controller = Box::new(XhciController::new(
950 vm.get_memory().clone(),
951 Box::new(usb_provider),
952 ));
953 devices.push((
954 usb_controller,
955 simple_jail(&cfg.jail_config, "xhci_device")?,
956 ));
957 }
958
959 for params in &cfg.stub_pci_devices {
960 // Stub devices don't need jailing since they don't do anything.
961 devices.push((Box::new(StubPciDevice::new(params)), None));
962 }
963
964 devices.push((
965 Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
966 None,
967 ));
968
969 Ok(devices)
970 }
971
972 fn create_file_backed_mappings(
973 cfg: &Config,
974 vm: &mut impl Vm,
975 resources: &mut SystemAllocator,
976 ) -> Result<()> {
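    // Each configured mapping is mmapped from its backing file on the host, reserved in the
    // MMIO allocator when it overlaps an MMIO region, and then added to the guest address
    // space at the requested guest-physical address.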
977 for mapping in &cfg.file_backed_mappings {
978 let file = OpenOptions::new()
979 .read(true)
980 .write(mapping.writable)
981 .custom_flags(if mapping.sync { libc::O_SYNC } else { 0 })
982 .open(&mapping.path)
983 .context("failed to open file for file-backed mapping")?;
984 let prot = if mapping.writable {
985 Protection::read_write()
986 } else {
987 Protection::read()
988 };
989 let size = mapping
990 .size
991 .try_into()
992 .context("Invalid size for file-backed mapping")?;
993 let memory_mapping = MemoryMappingBuilder::new(size)
994 .from_file(&file)
995 .offset(mapping.offset)
996 .protection(prot)
997 .build()
998 .context("failed to map backing file for file-backed mapping")?;
999
1000 let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
1001 .context("failed to convert to AddressRange")?;
1002 match resources.mmio_allocator_any().allocate_at(
1003 mapping_range,
1004 Alloc::FileBacked(mapping.address),
1005 "file-backed mapping".to_owned(),
1006 ) {
1007 // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
1008 // consider it an error.
1009 // TODO(b/222769529): Reserve this region in a global memory address space allocator
1010 // once we have that so nothing else can accidentally overlap with it.
1011 Ok(()) | Err(resources::Error::OutOfSpace) => {}
1012 e => e.context("failed to allocate guest address for file-backed mapping")?,
1013 }
1014
1015 vm.add_memory_region(
1016 GuestAddress(mapping.address),
1017 Box::new(memory_mapping),
1018 !mapping.writable,
1019 /* log_dirty_pages = */ false,
1020 MemCacheType::CacheCoherent,
1021 )
1022 .context("failed to configure file-backed mapping")?;
1023 }
1024
1025 Ok(())
1026 }
1027
1028 #[cfg(target_arch = "x86_64")]
1029 /// Collection of devices related to PCI hotplug.
1030 struct HotPlugStub {
1031 /// Map from bus index to hotplug bus.
1032 hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
1033 /// Bus ranges of devices for virtio-iommu.
1034 iommu_bus_ranges: Vec<RangeInclusive<u32>>,
1035 /// Map from gpe index to GpeNotify devices.
1036 gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
1037     /// Map from bus index to PmeNotify devices.
1038 pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
1039 }
1040
1041 #[cfg(target_arch = "x86_64")]
1042 impl HotPlugStub {
1043 /// Constructs empty HotPlugStub.
1044     fn new() -> Self {
1045 Self {
1046 hotplug_buses: BTreeMap::new(),
1047 iommu_bus_ranges: Vec::new(),
1048 gpe_notify_devs: BTreeMap::new(),
1049 pme_notify_devs: BTreeMap::new(),
1050 }
1051 }
1052 }
1053
1054 #[cfg(target_arch = "x86_64")]
1055 /// Creates PCIe root ports with only virtual devices.
1056 ///
1057 /// The user does not specify host PCIe root ports to link to these virtual root ports;
1058 /// instead, empty buses are found and fully virtual PCIe root ports are created on them.
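/// For example, if buses 1 and 2 are unused and `hp_bus_count` is 2, buses 1 and 2 become
/// hot-pluggable root ports, while every occupied non-root bus gets a plain root port.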
1059 fn create_pure_virtual_pcie_root_port(
1060 sys_allocator: &mut SystemAllocator,
1061 irq_control_tubes: &mut Vec<Tube>,
1062 devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
1063 hp_bus_count: u8,
1064 ) -> Result<HotPlugStub> {
1065 let mut hp_sec_buses = Vec::new();
1066 let mut hp_stub = HotPlugStub::new();
1067     // Create a PCIe root port for each non-root bus; each non-root bus device will be
1068     // connected behind a virtual PCIe root port.
1069 for i in 1..255 {
1070 if sys_allocator.pci_bus_empty(i) {
1071 if hp_sec_buses.len() < hp_bus_count.into() {
1072 hp_sec_buses.push(i);
1073 }
1074 continue;
1075 }
1076 let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
1077 hp_stub
1078 .pme_notify_devs
1079 .insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
1080 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1081 irq_control_tubes.push(msi_host_tube);
1082 let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1083         // No IPC is used if the root port disables hotplug.
1084 devices.push((pci_bridge, None));
1085 }
1086
1087     // Create PCIe root ports for hot-plug.
1088 if hp_sec_buses.len() < hp_bus_count.into() {
1089 return Err(anyhow!("no more addresses are available"));
1090 }
1091
1092 for hp_sec_bus in hp_sec_buses {
1093 let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
1094 hp_stub.pme_notify_devs.insert(
1095 hp_sec_bus,
1096 pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
1097 );
1098 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1099 irq_control_tubes.push(msi_host_tube);
1100 let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1101
1102 hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
1103 PciAddress {
1104 bus: pci_bridge.get_secondary_num(),
1105 dev: 0,
1106 func: 0,
1107 }
1108 .to_u32(),
1109 PciAddress {
1110 bus: pci_bridge.get_subordinate_num(),
1111 dev: 32,
1112 func: 8,
1113 }
1114 .to_u32(),
1115 ));
1116
1117 devices.push((pci_bridge, None));
1118 hp_stub
1119 .hotplug_buses
1120 .insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
1121 }
1122 Ok(hp_stub)
1123 }
1124
1125 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
1126 let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
1127 Some(
1128 open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
1129 .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
1130 )
1131 } else {
1132 None
1133 };
1134 let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
1135 Some(
1136 open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
1137 .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
1138 )
1139 } else {
1140 None
1141 };
1142
1143 let vm_image = match cfg.executable_path {
1144 Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
1145 open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
1146 || format!("failed to open kernel image {}", kernel_path.display()),
1147 )?,
1148 ),
1149 Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
1150 open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
1151 .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
1152 ),
1153 _ => panic!("Did not receive a bios or kernel, should be impossible."),
1154 };
1155
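    // The swiotlb size is specified in MiB; protected VMs get a 64 MiB bounce buffer by
    // default, while unprotected VMs get none unless one is requested explicitly.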
1156 let swiotlb = if let Some(size) = cfg.swiotlb {
1157 Some(
1158 size.checked_mul(1024 * 1024)
1159 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
1160 )
1161 } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
1162 None
1163 } else {
1164 Some(64 * 1024 * 1024)
1165 };
1166
1167 let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
1168 {
1169 (
1170 Some(
1171 open_file_or_duplicate(
1172 &pflash_parameters.path,
1173 OpenOptions::new().read(true).write(true),
1174 )
1175 .with_context(|| {
1176 format!("failed to open pflash {}", pflash_parameters.path.display())
1177 })?,
1178 ),
1179 pflash_parameters.block_size,
1180 )
1181 } else {
1182 (None, 0)
1183 };
1184
1185 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1186 let mut cpu_frequencies = BTreeMap::new();
1187 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1188 let mut virt_cpufreq_socket = None;
1189
1190 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1191 if cfg.virt_cpufreq {
1192 let host_cpu_frequencies = Arch::get_host_cpu_frequencies_khz()?;
1193
1194 for cpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
1195 let vcpu_affinity = match cfg.vcpu_affinity.clone() {
1196 Some(VcpuAffinity::Global(v)) => v,
1197 Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
1198 None => {
1199 panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
1200 }
1201 };
1202
1203 // Check that the physical CPUs that the vCPU is affined to all share the same
1204 // frequency domain.
1205 if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
1206 for cpu in vcpu_affinity.iter() {
1207 if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
1208 if frequencies != freq_domain {
1209 panic!("Affined CPUs do not share a frequency domain!");
1210 }
1211 }
1212 }
1213 cpu_frequencies.insert(cpu_id, freq_domain.clone());
1214 } else {
1215 panic!("No frequency domain for cpu:{}", cpu_id);
1216 }
1217 }
1218
1219 virt_cpufreq_socket = if let Some(path) = &cfg.virt_cpufreq_socket {
1220 let file = base::open_file_or_duplicate(path, OpenOptions::new().write(true))
1221 .with_context(|| {
1222 format!("failed to open virt_cpufreq_socket {}", path.display())
1223 })?;
1224 let fd: std::os::fd::OwnedFd = file.into();
1225 let socket: std::os::unix::net::UnixStream = fd.into();
1226 Some(socket)
1227 } else {
1228 None
1229 };
1230 }
1231
1232     // If --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg.
1233 let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
1234 let (cpu_clusters, cpu_capacity) = if cfg.host_cpu_topology {
1235 (
1236 Arch::get_host_cpu_clusters()?,
1237 Arch::get_host_cpu_capacity()?,
1238 )
1239 } else {
1240 (cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
1241 };
1242
1243 Ok(VmComponents {
1244 #[cfg(target_arch = "x86_64")]
1245 ac_adapter: cfg.ac_adapter,
1246 #[cfg(target_arch = "x86_64")]
1247 break_linux_pci_config_io: cfg.break_linux_pci_config_io,
1248 memory_size: cfg
1249 .memory
1250 .unwrap_or(256)
1251 .checked_mul(1024 * 1024)
1252 .ok_or_else(|| anyhow!("requested memory size too large"))?,
1253 swiotlb,
1254 fw_cfg_enable,
1255 bootorder_fw_cfg_blob: Vec::new(),
1256 vcpu_count: cfg.vcpu_count.unwrap_or(1),
1257 vcpu_affinity: cfg.vcpu_affinity.clone(),
1258 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1259 cpu_frequencies,
1260 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1261 virt_cpufreq_socket,
1262 fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
1263 cpu_clusters,
1264 cpu_capacity,
1265 no_smt: cfg.no_smt,
1266 hugepages: cfg.hugepages,
1267 hv_cfg: hypervisor::Config {
1268 #[cfg(target_arch = "aarch64")]
1269 mte: cfg.mte,
1270 protection_type: cfg.protection_type,
1271 },
1272 vm_image,
1273 android_fstab: cfg
1274 .android_fstab
1275 .as_ref()
1276 .map(|x| {
1277 File::open(x)
1278 .with_context(|| format!("failed to open android fstab file {}", x.display()))
1279 })
1280 .map_or(Ok(None), |v| v.map(Some))?,
1281 pstore: cfg.pstore.clone(),
1282 pflash_block_size,
1283 pflash_image,
1284 initrd_image,
1285 extra_kernel_params: cfg.params.clone(),
1286 acpi_sdts: cfg
1287 .acpi_tables
1288 .iter()
1289 .map(|path| {
1290 SDT::from_file(path)
1291 .with_context(|| format!("failed to open ACPI file {}", path.display()))
1292 })
1293 .collect::<Result<Vec<SDT>>>()?,
1294 rt_cpus: cfg.rt_cpus.clone(),
1295 delay_rt: cfg.delay_rt,
1296 #[cfg(feature = "gdb")]
1297 gdb: None,
1298 no_i8042: cfg.no_i8042,
1299 no_rtc: cfg.no_rtc,
1300 #[cfg(target_arch = "x86_64")]
1301 smbios: cfg.smbios.clone(),
1302 host_cpu_topology: cfg.host_cpu_topology,
1303 itmt: cfg.itmt,
1304 #[cfg(target_arch = "x86_64")]
1305 force_s2idle: cfg.force_s2idle,
1306 pvm_fw: pvm_fw_image,
1307 #[cfg(target_arch = "x86_64")]
1308 pcie_ecam: cfg.pcie_ecam,
1309 #[cfg(target_arch = "x86_64")]
1310 pci_low_start: cfg.pci_low_start,
1311 dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
1312 boot_cpu: cfg.boot_cpu,
1313 })
1314 }
1315
1316 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
1317 pub enum ExitState {
1318 Reset,
1319 Stop,
1320 Crash,
1321 GuestPanic,
1322 WatchdogReset,
1323 }
1324 // Remove ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
1325 // Returns the updated guest memory layout.
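// For example (illustrative values), a single region covering 0x0..0x8000_0000 with a
// file-backed mapping at 0x2000_0000..0x3000_0000 is split into 0x0..0x2000_0000 and
// 0x3000_0000..0x8000_0000.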
1326 fn punch_holes_in_guest_mem_layout_for_mappings(
1327 guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
1328 file_backed_mappings: &[FileBackedMappingParameters],
1329 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
1330 // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
1331 // at end is not included in the range).
1332 let mut layout_set = BTreeSet::new();
1333 for (addr, size, options) in &guest_mem_layout {
1334 layout_set.insert((addr.offset(), addr.offset() + size, *options));
1335 }
1336
1337 for mapping in file_backed_mappings {
1338 let mapping_start = mapping.address;
1339 let mapping_end = mapping_start + mapping.size;
1340
1341 // Repeatedly split overlapping guest memory regions until no overlaps remain.
1342 while let Some((range_start, range_end, options)) = layout_set
1343 .iter()
1344 .find(|&&(range_start, range_end, _)| {
1345 mapping_start < range_end && mapping_end > range_start
1346 })
1347 .cloned()
1348 {
1349 layout_set.remove(&(range_start, range_end, options));
1350
1351 if range_start < mapping_start {
1352 layout_set.insert((range_start, mapping_start, options));
1353 }
1354 if range_end > mapping_end {
1355 layout_set.insert((mapping_end, range_end, options));
1356 }
1357 }
1358 }
1359
1360 // Build the final guest memory layout from the modified layout_set.
1361 layout_set
1362 .iter()
1363 .map(|(start, end, options)| (GuestAddress(*start), end - start, *options))
1364 .collect()
1365 }
1366
1367 fn create_guest_memory(
1368 cfg: &Config,
1369 components: &VmComponents,
1370 hypervisor: &impl Hypervisor,
1371 ) -> Result<GuestMemory> {
1372 let guest_mem_layout = Arch::guest_memory_layout(components, hypervisor)
1373 .context("failed to create guest memory layout")?;
1374
1375 let guest_mem_layout =
1376 punch_holes_in_guest_mem_layout_for_mappings(guest_mem_layout, &cfg.file_backed_mappings);
1377
1378 let guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1379 .context("failed to create guest memory")?;
1380 let mut mem_policy = MemoryPolicy::empty();
1381 if components.hugepages {
1382 mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1383 }
1384
1385 if cfg.lock_guest_memory {
1386 mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1387 }
1388 guest_mem.set_memory_policy(mem_policy);
1389
1390 if cfg.unmap_guest_memory_on_fork {
1391 // Note that this isn't compatible with sandboxing. We could potentially fix that by
1392 // delaying the call until after the sandboxed devices are forked. However, the main use
1393 // for this is in conjunction with protected VMs, where most of the guest memory has been
1394 // unshared with the host. We'd need to be confident that the guest memory is unshared with
1395 // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1396 // So, for now we keep things simple to be safe.
1397 guest_mem.use_dontfork().context("use_dontfork failed")?;
1398 }
1399
1400 Ok(guest_mem)
1401 }
1402
1403 #[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
1404 fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
1405 use devices::GeniezoneKernelIrqChip;
1406 use hypervisor::geniezone::Geniezone;
1407 use hypervisor::geniezone::GeniezoneVcpu;
1408 use hypervisor::geniezone::GeniezoneVm;
1409
1410 let device_path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
1411 let gzvm = Geniezone::new_with_path(device_path)
1412 .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;
1413
1414 let guest_mem = create_guest_memory(&cfg, &components, &gzvm)?;
1415
1416 #[cfg(feature = "swap")]
1417 let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
1418 Some(
1419 SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
1420 .context("launch vmm-swap monitor process")?,
1421 )
1422 } else {
1423 None
1424 };
1425
1426 let vm =
1427 GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;
1428
1429 // Check that the VM was actually created in protected mode as expected.
1430 if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
1431 bail!("Failed to create protected VM");
1432 }
1433 let vm_clone = vm.try_clone().context("failed to clone vm")?;
1434
1435 let ioapic_host_tube;
1436 let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
1437 IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
1438 IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
1439 IrqChipKind::Kernel => {
1440 ioapic_host_tube = None;
1441 GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
1442 .context("failed to create IRQ chip")?
1443 }
1444 };
1445
1446 run_vm::<GeniezoneVcpu, GeniezoneVm>(
1447 cfg,
1448 components,
1449 vm,
1450 &mut irq_chip,
1451 ioapic_host_tube,
1452 #[cfg(feature = "swap")]
1453 swap_controller,
1454 )
1455 }
1456
1457 fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
1458 use devices::KvmKernelIrqChip;
1459 #[cfg(target_arch = "x86_64")]
1460 use devices::KvmSplitIrqChip;
1461 use hypervisor::kvm::Kvm;
1462 use hypervisor::kvm::KvmVcpu;
1463 use hypervisor::kvm::KvmVm;
1464
1465 let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
1466 let kvm = Kvm::new_with_path(device_path)
1467 .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;
1468
1469 let guest_mem = create_guest_memory(&cfg, &components, &kvm)?;
1470
1471 #[cfg(feature = "swap")]
1472 let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
1473 Some(
1474 SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
1475 .context("launch vmm-swap monitor process")?,
1476 )
1477 } else {
1478 None
1479 };
1480
1481 let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;
1482
1483 #[cfg(target_arch = "x86_64")]
1484 if cfg.itmt {
1485 vm.set_platform_info_read_access(false)
1486 .context("failed to disable MSR_PLATFORM_INFO read access")?;
1487 }
1488
1489 // Check that the VM was actually created in protected mode as expected.
1490 if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
1491 bail!("Failed to create protected VM");
1492 }
1493 let vm_clone = vm.try_clone().context("failed to clone vm")?;
1494
1495 enum KvmIrqChip {
1496 #[cfg(target_arch = "x86_64")]
1497 Split(KvmSplitIrqChip),
1498 Kernel(KvmKernelIrqChip),
1499 }
1500
1501 impl KvmIrqChip {
1502 fn as_mut(&mut self) -> &mut dyn IrqChipArch {
1503 match self {
1504 #[cfg(target_arch = "x86_64")]
1505 KvmIrqChip::Split(i) => i,
1506 KvmIrqChip::Kernel(i) => i,
1507 }
1508 }
1509 }
1510
1511 let ioapic_host_tube;
1512 let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
1513 IrqChipKind::Userspace => {
1514 bail!("KVM userspace irqchip mode not implemented");
1515 }
1516 IrqChipKind::Split => {
1517 #[cfg(not(target_arch = "x86_64"))]
1518 bail!("KVM split irqchip mode only supported on x86 processors");
1519 #[cfg(target_arch = "x86_64")]
1520 {
1521 let (host_tube, ioapic_device_tube) =
1522 Tube::pair().context("failed to create tube")?;
1523 ioapic_host_tube = Some(host_tube);
1524 KvmIrqChip::Split(
1525 KvmSplitIrqChip::new(
1526 vm_clone,
1527 components.vcpu_count,
1528 ioapic_device_tube,
1529 Some(24),
1530 )
1531 .context("failed to create IRQ chip")?,
1532 )
1533 }
1534 }
1535 IrqChipKind::Kernel => {
1536 ioapic_host_tube = None;
1537 KvmIrqChip::Kernel(
1538 KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
1539 .context("failed to create IRQ chip")?,
1540 )
1541 }
1542 };
1543
1544 run_vm::<KvmVcpu, KvmVm>(
1545 cfg,
1546 components,
1547 vm,
1548 irq_chip.as_mut(),
1549 ioapic_host_tube,
1550 #[cfg(feature = "swap")]
1551 swap_controller,
1552 )
1553 }
1554
1555 #[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
1556 fn run_gunyah(
1557 device_path: Option<&Path>,
1558 cfg: Config,
1559 components: VmComponents,
1560 ) -> Result<ExitState> {
1561 use devices::GunyahIrqChip;
1562 use hypervisor::gunyah::Gunyah;
1563 use hypervisor::gunyah::GunyahVcpu;
1564 use hypervisor::gunyah::GunyahVm;
1565
1566 let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
1567 let gunyah = Gunyah::new_with_path(device_path)
1568 .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;
1569
1570 let guest_mem = create_guest_memory(&cfg, &components, &gunyah)?;
1571
1572 #[cfg(feature = "swap")]
1573 let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
1574 Some(
1575 SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
1576 .context("launch vmm-swap monitor process")?,
1577 )
1578 } else {
1579 None
1580 };
1581
1582 let vm = GunyahVm::new(&gunyah, guest_mem, components.hv_cfg).context("failed to create vm")?;
1583
1584 // Check that the VM was actually created in protected mode as expected.
1585 if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
1586 bail!("Failed to create protected VM");
1587 }
1588
1589 let vm_clone = vm.try_clone()?;
1590
1591 run_vm::<GunyahVcpu, GunyahVm>(
1592 cfg,
1593 components,
1594 vm,
1595 &mut GunyahIrqChip::new(vm_clone)?,
1596 None,
1597 #[cfg(feature = "swap")]
1598 swap_controller,
1599 )
1600 }
1601
1602 /// Choose a default hypervisor if no `--hypervisor` option was specified.
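///
/// Probes for `/dev/kvm` first and falls back to the GenieZone and Gunyah device nodes when
/// the corresponding features are enabled.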
1603 fn get_default_hypervisor() -> Option<HypervisorKind> {
1604 let kvm_path = Path::new(KVM_PATH);
1605 if kvm_path.exists() {
1606 return Some(HypervisorKind::Kvm {
1607 device: Some(kvm_path.to_path_buf()),
1608 });
1609 }
1610
1611 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1612 #[cfg(feature = "geniezone")]
1613 {
1614 let gz_path = Path::new(GENIEZONE_PATH);
1615 if gz_path.exists() {
1616 return Some(HypervisorKind::Geniezone {
1617 device: Some(gz_path.to_path_buf()),
1618 });
1619 }
1620 }
1621
1622 #[cfg(all(
1623 unix,
1624 any(target_arch = "arm", target_arch = "aarch64"),
1625 feature = "gunyah"
1626 ))]
1627 {
1628 let gunyah_path = Path::new(GUNYAH_PATH);
1629 if gunyah_path.exists() {
1630 return Some(HypervisorKind::Gunyah {
1631 device: Some(gunyah_path.to_path_buf()),
1632 });
1633 }
1634 }
1635
1636 None
1637 }
1638
1639 pub fn run_config(cfg: Config) -> Result<ExitState> {
1640 if let Some(async_executor) = cfg.async_executor {
1641 Executor::set_default_executor_kind(async_executor)
1642 .context("Failed to set the default async executor")?;
1643 }
1644
1645 let components = setup_vm_components(&cfg)?;
1646
1647 let hypervisor = cfg
1648 .hypervisor
1649 .clone()
1650 .or_else(get_default_hypervisor)
1651 .context("no enabled hypervisor")?;
1652
1653 debug!("creating hypervisor: {:?}", hypervisor);
1654
1655 match hypervisor {
1656 HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
1657 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1658 #[cfg(feature = "geniezone")]
1659 HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
1660 #[cfg(all(
1661 unix,
1662 any(target_arch = "arm", target_arch = "aarch64"),
1663 feature = "gunyah"
1664 ))]
1665 HypervisorKind::Gunyah { device } => run_gunyah(device.as_deref(), cfg, components),
1666 }
1667 }
1668
1669 fn run_vm<Vcpu, V>(
1670 cfg: Config,
1671 #[allow(unused_mut)] mut components: VmComponents,
1672 mut vm: V,
1673 irq_chip: &mut dyn IrqChipArch,
1674 ioapic_host_tube: Option<Tube>,
1675 #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
1676 ) -> Result<ExitState>
1677 where
1678 Vcpu: VcpuArch + 'static,
1679 V: VmArch + 'static,
1680 {
1681 if cfg.jail_config.is_some() {
1682 // Printing something to the syslog before entering minijail so that libc's syslogger has a
1683 // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
1684 // access to those files will not be possible.
1685 info!("crosvm entering multiprocess mode");
1686 }
1687
1688 let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
1689 metrics::initialize(metrics_send);
1690
1691 #[cfg(all(feature = "pci-hotplug", feature = "swap"))]
1692 let swap_device_helper = match &swap_controller {
1693 Some(swap_controller) => Some(swap_controller.create_device_helper()?),
1694 None => None,
1695 };
1696     // pci-hotplug is only implemented for x86_64 for now; attempting to use it on other
1697     // platforms would crash.
1698 #[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
1699 if cfg.pci_hotplug_slots.is_some() {
1700 bail!("pci-hotplug is not implemented for non x86_64 architecture");
1701 }
1702     // hotplug_manager must be created before the vm is started since it forks the jail warden process.
1703 #[cfg(feature = "pci-hotplug")]
1704 // TODO(293801301): Remove unused_mut after aarch64 support
1705 #[allow(unused_mut)]
1706 let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
1707 Some(PciHotPlugManager::new(
1708 vm.get_memory().clone(),
1709 &cfg,
1710 #[cfg(feature = "swap")]
1711 swap_device_helper,
1712 )?)
1713 } else {
1714 None
1715 };
1716
1717 #[cfg(feature = "gpu")]
1718 let (gpu_control_host_tube, gpu_control_device_tube) =
1719 Tube::pair().context("failed to create gpu tube")?;
1720
1721 #[cfg(feature = "usb")]
1722 let (usb_control_tube, usb_provider) =
1723 DeviceProvider::new().context("failed to create usb provider")?;
1724
1725 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
1726 // before any jailed devices have been spawned, so that we can catch any of them that fail very
1727 // quickly.
1728 let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
1729
1730 let control_server_socket = match &cfg.socket_path {
1731 Some(path) => Some(UnlinkUnixSeqpacketListener(
1732 UnixSeqpacketListener::bind(path).context("failed to create control server")?,
1733 )),
1734 None => None,
1735 };
1736
1737 let mut control_tubes = Vec::new();
1738 let mut irq_control_tubes = Vec::new();
1739 let mut vm_memory_control_tubes = Vec::new();
1740
1741 #[cfg(feature = "gdb")]
1742 if let Some(port) = cfg.gdb {
1743 // GDB needs a control socket to interrupt vcpus.
1744 let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
1745 control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
1746 components.gdb = Some((port, gdb_control_tube));
1747 }
1748
1749 #[cfg(feature = "balloon")]
1750 let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
1751 if let Some(ref path) = cfg.balloon_control {
1752 (
1753 None,
1754 Some(Tube::new_from_unix_seqpacket(
1755 UnixSeqpacket::connect(path).with_context(|| {
1756 format!(
1757 "failed to connect to balloon control socket {}",
1758 path.display(),
1759 )
1760 })?,
1761 )?),
1762 )
1763 } else {
1764 // Balloon gets a special socket so balloon requests can be forwarded
1765 // from the main process.
1766 let (host, device) = Tube::pair().context("failed to create tube")?;
1767 (Some(host), Some(device))
1768 }
1769 } else {
1770 (None, None)
1771 };
1772
1773 // The balloon device also needs a tube to communicate back to the main process to
1774 // handle remapping memory dynamically.
1775 #[cfg(feature = "balloon")]
1776 let dynamic_mapping_device_tube = if cfg.balloon {
1777 let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
1778 Tube::pair().context("failed to create tube")?;
1779 vm_memory_control_tubes.push(VmMemoryTube {
1780 tube: dynamic_mapping_host_tube,
1781 expose_with_viommu: false,
1782 });
1783 Some(dynamic_mapping_device_tube)
1784 } else {
1785 None
1786 };
1787
1788 // Create one control socket per disk.
1789 let mut disk_device_tubes = Vec::new();
1790 let mut disk_host_tubes = Vec::new();
1791 let disk_count = cfg.disks.len();
1792 for _ in 0..disk_count {
1793 let (disk_host_tub, disk_device_tube) = Tube::pair().context("failed to create tube")?;
1794 disk_host_tubes.push(disk_host_tub);
1795 disk_device_tubes.push(disk_device_tube);
1796 }
1797
1798 let mut pmem_device_tubes = Vec::new();
1799 let pmem_count = cfg.pmem_devices.len();
1800 for _ in 0..pmem_count {
1801 let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
1802 pmem_device_tubes.push(pmem_device_tube);
1803 control_tubes.push(TaggedControlTube::VmMsync(pmem_host_tube));
1804 }
1805
1806 if let Some(ioapic_host_tube) = ioapic_host_tube {
1807 irq_control_tubes.push(ioapic_host_tube);
1808 }
1809
1810 let battery = if cfg.battery_config.is_some() {
1811 #[cfg_attr(
1812 not(feature = "power-monitor-powerd"),
1813 allow(clippy::manual_map, clippy::needless_match, unused_mut)
1814 )]
1815 let jail = if let Some(jail_config) = &cfg.jail_config {
1816 let mut config = SandboxConfig::new(jail_config, "battery");
1817 #[cfg(feature = "power-monitor-powerd")]
1818 {
1819 config.bind_mounts = true;
1820 }
1821 let mut jail =
1822 create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
1823
1824 // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
1825 #[cfg(feature = "power-monitor-powerd")]
1826 {
1827 let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
1828 jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
1829 }
1830 Some(jail)
1831 } else {
1832 None
1833 };
1834 (cfg.battery_config.as_ref().map(|c| c.type_), jail)
1835 } else {
1836 (cfg.battery_config.as_ref().map(|c| c.type_), None)
1837 };
1838
1839 let fs_count = cfg
1840 .shared_dirs
1841 .iter()
1842 .filter(|sd| sd.kind == SharedDirKind::FS)
1843 .count();
1844 let mut fs_device_tubes = Vec::with_capacity(fs_count);
1845 for _ in 0..fs_count {
1846 let (fs_host_tube, fs_device_tube) = Tube::pair().context("failed to create tube")?;
1847 control_tubes.push(TaggedControlTube::Fs(fs_host_tube));
1848 fs_device_tubes.push(fs_device_tube);
1849 }
1850
1851 let (vm_evt_wrtube, vm_evt_rdtube) =
1852 Tube::directional_pair().context("failed to create vm event tube")?;
1853
1854 let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
1855 let mut sys_allocator = SystemAllocator::new(
1856 Arch::get_system_allocator_config(&vm),
1857 pstore_size,
1858 &cfg.mmio_address_ranges,
1859 )
1860 .context("failed to create system allocator")?;
1861
1862 let ramoops_region = match &components.pstore {
1863 Some(pstore) => Some(
1864 arch::pstore::create_memory_region(
1865 &mut vm,
1866 sys_allocator.reserved_region().unwrap(),
1867 pstore,
1868 )
1869 .context("failed to allocate pstore region")?,
1870 ),
1871 None => None,
1872 };
1873
1874 create_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
1875
1876 #[cfg(feature = "gpu")]
1877 // Hold on to the render server jail so it keeps running until we exit run_vm()
1878 let (_render_server_jail, render_server_fd) =
1879 if let Some(parameters) = &cfg.gpu_render_server_parameters {
1880 let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
1881 (Some(ScopedMinijail(jail)), Some(fd))
1882 } else {
1883 (None, None)
1884 };
1885
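    // If an initial memory size was requested, the balloon starts inflated by the difference
    // between the total memory size and the initial memory (given in MiB); otherwise it starts
    // deflated (size 0).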
1886 #[cfg(feature = "balloon")]
1887 let init_balloon_size = components
1888 .memory_size
1889 .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
1890 m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
1891 }))
1892 .context("failed to calculate init balloon size")?;
1893
1894 let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
1895 BTreeMap::new();
1896 let mut iova_max_addr: Option<u64> = None;
1897
1898 // pvclock gets a tube for handling suspend/resume requests from the main thread.
1899 #[cfg(feature = "pvclock")]
1900 let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
1901 let (host, device) = Tube::pair().context("failed to create tube")?;
1902 (Some(host), Some(device))
1903 } else {
1904 (None, None)
1905 };
1906 #[cfg(not(feature = "pvclock"))]
1907 if cfg.pvclock {
1908 bail!("pvclock device is only supported when crosvm is built with a feature 'pvclock'");
1909 }
1910
1911 #[cfg(feature = "registered_events")]
1912 let (reg_evt_wrtube, reg_evt_rdtube) =
1913 Tube::directional_pair().context("failed to create registered event tube")?;
1914
1915 let mut devices = create_devices(
1916 &cfg,
1917 &mut vm,
1918 &mut sys_allocator,
1919 &vm_evt_wrtube,
1920 &mut iommu_attached_endpoints,
1921 &mut irq_control_tubes,
1922 &mut vm_memory_control_tubes,
1923 &mut control_tubes,
1924 #[cfg(feature = "balloon")]
1925 balloon_device_tube,
1926 #[cfg(feature = "balloon")]
1927 init_balloon_size,
1928 #[cfg(feature = "balloon")]
1929 dynamic_mapping_device_tube,
1930 &mut disk_device_tubes,
1931 &mut pmem_device_tubes,
1932 &mut fs_device_tubes,
1933 #[cfg(feature = "usb")]
1934 usb_provider,
1935 #[cfg(feature = "gpu")]
1936 gpu_control_device_tube,
1937 #[cfg(feature = "gpu")]
1938 render_server_fd,
1939 &mut iova_max_addr,
1940 #[cfg(feature = "registered_events")]
1941         &reg_evt_wrtube,
1942 #[cfg(feature = "pvclock")]
1943 pvclock_device_tube,
1944 )?;
1945
1946 #[cfg(feature = "pci-hotplug")]
1947 // TODO(293801301): Remove unused_variables after aarch64 support
1948 #[allow(unused_variables)]
1949 let pci_hotplug_slots = cfg.pci_hotplug_slots;
1950 #[cfg(not(feature = "pci-hotplug"))]
1951 #[allow(unused_variables)]
1952 let pci_hotplug_slots: Option<u8> = None;
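    // On x86_64, create pure virtual PCIe root ports up front so that hotplug-capable buses
    // exist for devices (and, when enabled, PCI hotplug) to attach to.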
1953 #[cfg(target_arch = "x86_64")]
1954 let hp_stub = create_pure_virtual_pcie_root_port(
1955 &mut sys_allocator,
1956 &mut irq_control_tubes,
1957 &mut devices,
1958 pci_hotplug_slots.unwrap_or(1),
1959 )?;
1960
1961 arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
1962
1963 let pci_devices: Vec<&dyn PciDevice> = devices
1964 .iter()
1965 .filter_map(|d| (d.0).as_pci_device())
1966 .collect();
1967
1968 let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
1969 .into_iter()
1970 .flat_map(|s| {
1971 if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
1972 std::iter::zip(
1973 Some(virtio_pci_device.virtio_device()),
1974 virtio_pci_device.pci_address(),
1975 )
1976 .next()
1977 } else {
1978 None
1979 }
1980 })
1981 .collect();
1982
1983 let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
1984 .iter()
1985 .flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
1986 .collect();
1987
1988     // Order the OpenFirmware device paths in ascending order by their boot_index.
1989 open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));
1990
1991 // "/pci@iocf8/" is x86 specific and represents the root at the system bus port
1992 let mut bootorder_fw_cfg_blob =
1993 open_firmware_device_paths
1994 .into_iter()
1995 .fold(Vec::new(), |a, b| {
1996 a.into_iter()
1997 .chain("/pci@i0cf8/".as_bytes().iter().copied())
1998 .chain(b.0)
1999 .chain("\n".as_bytes().iter().copied())
2000 .collect()
2001 });
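    // Each entry in the blob built above is "/pci@i0cf8/" followed by the device-provided
    // OpenFirmware path and a trailing newline, ordered by boot_index.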
2002
2003 // the "bootorder" file is expected to end with a null terminator
2004 bootorder_fw_cfg_blob.push(0);
2005
2006 components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;
2007
2008 // if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
2009 // "bootorder" file can be accessed by the guest.
2010 components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;
2011
2012 let (translate_response_senders, request_rx) = setup_virtio_access_platform(
2013 &mut sys_allocator,
2014 &mut iommu_attached_endpoints,
2015 &mut devices,
2016 )?;
2017
2018 #[cfg(target_arch = "x86_64")]
2019 let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
2020 #[cfg(not(target_arch = "x86_64"))]
2021 let iommu_bus_ranges = Vec::new();
2022
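    // If any endpoints are attached to the virtio-iommu, or hotplugged VFIO devices may need
    // isolation, create the iommu device and expose it to the guest as a virtio PCI device.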
2023 let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
2024 || (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
2025 {
2026 let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
2027 let iommu_dev = create_iommu_device(
2028 cfg.protection_type,
2029 &cfg.jail_config,
2030 iova_max_addr.unwrap_or(u64::MAX),
2031 iommu_attached_endpoints,
2032 iommu_bus_ranges,
2033 translate_response_senders,
2034 request_rx,
2035 iommu_device_tube,
2036 )?;
2037
2038 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
2039 irq_control_tubes.push(msi_host_tube);
2040 let (ioevent_host_tube, ioevent_device_tube) =
2041 Tube::pair().context("failed to create ioevent tube")?;
2042 vm_memory_control_tubes.push(VmMemoryTube {
2043 tube: ioevent_host_tube,
2044 expose_with_viommu: false,
2045 });
2046 let (host_tube, device_tube) =
2047 Tube::pair().context("failed to create device control tube")?;
2048 control_tubes.push(TaggedControlTube::Vm(host_tube));
2049 let mut dev = VirtioPciDevice::new(
2050 vm.get_memory().clone(),
2051 iommu_dev.dev,
2052 msi_device_tube,
2053 cfg.disable_virtio_intx,
2054 None,
2055 VmMemoryClient::new(ioevent_device_tube),
2056 device_tube,
2057 )
2058 .context("failed to create virtio pci dev")?;
2059 // early reservation for viommu.
2060 dev.allocate_address(&mut sys_allocator)
2061 .context("failed to allocate resources early for virtio pci dev")?;
2062 let dev = Box::new(dev);
2063 devices.push((dev, iommu_dev.jail));
2064 Some(iommu_host_tube)
2065 } else {
2066 None
2067 };
2068
2069 #[cfg(target_arch = "x86_64")]
2070 for device in devices
2071 .iter_mut()
2072 .filter_map(|(dev, _)| dev.as_pci_device_mut())
2073 {
2074 let sdts = device
2075 .generate_acpi(components.acpi_sdts)
2076 .or_else(|| {
2077 error!("ACPI table generation error");
2078 None
2079 })
2080 .ok_or_else(|| anyhow!("failed to generate ACPI table"))?;
2081 components.acpi_sdts = sdts;
2082 }
2083
2084 // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
2085 let mut vcpu_ids = Vec::new();
2086
2087 let guest_suspended_cvar = if cfg.force_s2idle {
2088 Some(Arc::new((Mutex::new(false), Condvar::new())))
2089 } else {
2090 None
2091 };
2092
2093 let dt_overlays = cfg
2094 .device_tree_overlay
2095 .iter()
2096 .map(|o| {
2097 Ok(DtbOverlay {
2098 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2099 .with_context(|| {
2100 format!("failed to open device tree overlay {}", o.path.display())
2101 })?,
2102 do_filter: o.filter_devs,
2103 })
2104 })
2105 .collect::<Result<Vec<DtbOverlay>>>()?;
2106
2107 let mut linux = Arch::build_vm::<V, Vcpu>(
2108 components,
2109 &vm_evt_wrtube,
2110 &mut sys_allocator,
2111 &cfg.serial_parameters,
2112 simple_jail(&cfg.jail_config, "serial_device")?,
2113 battery,
2114 vm,
2115 ramoops_region,
2116 devices,
2117 irq_chip,
2118 &mut vcpu_ids,
2119 cfg.dump_device_tree_blob.clone(),
2120 simple_jail(&cfg.jail_config, "serial_device")?,
2121 #[cfg(target_arch = "x86_64")]
2122 simple_jail(&cfg.jail_config, "block_device")?,
2123 #[cfg(target_arch = "x86_64")]
2124 simple_jail(&cfg.jail_config, "fw_cfg_device")?,
2125 #[cfg(feature = "swap")]
2126 &mut swap_controller,
2127 guest_suspended_cvar.clone(),
2128 dt_overlays,
2129 )
2130 .context("the architecture failed to build the vm")?;
2131
2132 if let Some(tube) = linux.vm_request_tube.take() {
2133 control_tubes.push(TaggedControlTube::Vm(tube));
2134 }
2135
2136 #[cfg(target_arch = "x86_64")]
2137 let (hp_control_tube, hp_worker_tube) = mpsc::channel();
2138 #[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
2139 if let Some(hotplug_manager) = &mut hotplug_manager {
2140 hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
2141 }
2142 #[cfg(target_arch = "x86_64")]
2143 let hp_thread = {
2144 for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
2145 #[cfg(feature = "pci-hotplug")]
2146 if let Some(hotplug_manager) = &mut hotplug_manager {
2147 hotplug_manager.add_port(hp_bus)?;
2148 } else {
2149 linux.hotplug_bus.insert(bus_num, hp_bus);
2150 }
2151 #[cfg(not(feature = "pci-hotplug"))]
2152 linux.hotplug_bus.insert(bus_num, hp_bus);
2153 }
2154
2155 if let Some(pm) = &linux.pm {
2156 for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
2157 pm.lock().register_gpe_notify_dev(gpe, notify_dev);
2158 }
2159 for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
2160 pm.lock().register_pme_notify_dev(bus, notify_dev);
2161 }
2162 }
2163
2164 let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
2165 Tube::pair().context("failed to create tube")?;
2166 vm_memory_control_tubes.push(VmMemoryTube {
2167 tube: hp_vm_mem_host_tube,
2168 expose_with_viommu: false,
2169 });
2170
2171 let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
2172 let pci_root = linux.root_config.clone();
2173 std::thread::Builder::new()
2174 .name("pci_root".to_string())
2175 .spawn(move || {
2176 start_pci_root_worker(
2177 supports_readonly_mapping,
2178 pci_root,
2179 hp_worker_tube,
2180 hp_vm_mem_worker_tube,
2181 )
2182 })?
2183 };
2184
2185 let flags = RutabagaGrallocBackendFlags::new().disable_vulkano();
2186 let gralloc = RutabagaGralloc::new(flags).context("failed to create gralloc")?;
2187
2188 run_control(
2189 linux,
2190 sys_allocator,
2191 cfg,
2192 control_server_socket,
2193 irq_control_tubes,
2194 vm_memory_control_tubes,
2195 control_tubes,
2196 #[cfg(feature = "balloon")]
2197 balloon_host_tube,
2198 &disk_host_tubes,
2199 #[cfg(feature = "gpu")]
2200 gpu_control_host_tube,
2201 #[cfg(feature = "usb")]
2202 usb_control_tube,
2203 vm_evt_rdtube,
2204 vm_evt_wrtube,
2205 sigchld_fd,
2206 gralloc,
2207 vcpu_ids,
2208 iommu_host_tube,
2209 #[cfg(target_arch = "x86_64")]
2210 hp_control_tube,
2211 #[cfg(target_arch = "x86_64")]
2212 hp_thread,
2213 #[cfg(feature = "pci-hotplug")]
2214 hotplug_manager,
2215 #[cfg(feature = "swap")]
2216 swap_controller,
2217 #[cfg(feature = "registered_events")]
2218 reg_evt_rdtube,
2219 guest_suspended_cvar,
2220 #[cfg(feature = "pvclock")]
2221 pvclock_host_tube,
2222 metrics_recv,
2223 )
2224 }
2225
2226 // Hotplug commands can hit a deadlock when they try to acquire the lock for the PCI root
2227 // from the vm control thread. The deadlock happens when the vm control thread (thread A) is
2228 // handling a hotplug command and tries to take the PCI root lock, while the lock is already
2229 // held by another device in thread B, which is itself sending a vm control request to be
2230 // handled by thread A and is waiting for the response. Thread A, however, is blocked on
2231 // acquiring the lock, so neither thread can make progress. To resolve this issue, we add
2232 // this worker thread and push all work that needs to lock the PCI root onto it.
2234 #[cfg(target_arch = "x86_64")]
2235 fn start_pci_root_worker(
2236 supports_readonly_mapping: bool,
2237 pci_root: Arc<Mutex<PciRoot>>,
2238 hp_device_tube: mpsc::Receiver<PciRootCommand>,
2239 vm_control_tube: Tube,
2240 ) {
2241 struct PciMmioMapperTube {
2242 supports_readonly_mapping: bool,
2243 vm_control_tube: Tube,
2244 registered_regions: BTreeMap<u32, VmMemoryRegionId>,
2245 next_id: u32,
2246 }
2247
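    // Forwards read-only MMIO mappings to the VM memory handler over the vm control tube and
    // tracks the registered regions by a locally assigned id.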
2248 impl PciMmioMapper for PciMmioMapperTube {
2249 fn supports_readonly_mapping(&self) -> bool {
2250 self.supports_readonly_mapping
2251 }
2252
2253 fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
2254 let shmem = shmem
2255 .try_clone()
2256 .context("failed to create new SharedMemory")?;
2257 self.vm_control_tube
2258 .send(&VmMemoryRequest::RegisterMemory {
2259 source: VmMemorySource::SharedMemory(shmem),
2260 dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
2261 prot: Protection::read(),
2262 cache: MemCacheType::CacheCoherent,
2263 })
2264 .context("failed to send request")?;
2265 match self.vm_control_tube.recv::<VmMemoryResponse>() {
2266 Ok(VmMemoryResponse::RegisterMemory(slot)) => {
2267 let cur_id = self.next_id;
2268 self.registered_regions.insert(cur_id, slot);
2269 self.next_id += 1;
2270 Ok(cur_id)
2271 }
2272 res => bail!("Bad response: {:?}", res),
2273 }
2274 }
2275 }
2276
2277 let mut mapper = PciMmioMapperTube {
2278 supports_readonly_mapping,
2279 vm_control_tube,
2280 registered_regions: BTreeMap::new(),
2281 next_id: 0,
2282 };
2283
2284 loop {
2285 match hp_device_tube.recv() {
2286 Ok(cmd) => match cmd {
2287 PciRootCommand::Add(addr, device) => {
2288 if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
2289 error!("failed to add hotplugged device to PCI root port: {}", e);
2290 }
2291 }
2292 PciRootCommand::AddBridge(pci_bus) => {
2293 if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
2294 error!("failed to add hotplugged bridge to PCI root port: {}", e);
2295 }
2296 }
2297 PciRootCommand::Remove(addr) => {
2298 pci_root.lock().remove_device(addr);
2299 }
2300 PciRootCommand::Kill => break,
2301 },
2302 Err(e) => {
2303 error!("Error: pci root worker channel closed: {}", e);
2304 break;
2305 }
2306 }
2307 }
2308 }
2309
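/// Returns the hotplug bus, if any, that the given host PCI address belongs to.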
2310 #[cfg(target_arch = "x86_64")]
2311 fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2312 linux: &RunnableLinuxVm<V, Vcpu>,
2313 host_addr: PciAddress,
2314 ) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2315 for (_, hp_bus) in linux.hotplug_bus.iter() {
2316 if hp_bus.lock().is_match(host_addr).is_some() {
2317 return Ok(hp_bus.clone());
2318 }
2319 }
2320 Err(anyhow!("Failed to find a suitable hotplug bus"))
2321 }
2322
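/// Hotplugs a host PCI device (an upstream/downstream PCIe port or a VFIO endpoint) into the
/// guest, registering it with the matching hotplug bus and raising a hotplug interrupt if
/// requested.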
2323 #[cfg(target_arch = "x86_64")]
2324 fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
2325 linux: &mut RunnableLinuxVm<V, Vcpu>,
2326 sys_allocator: &mut SystemAllocator,
2327 cfg: &Config,
2328 irq_control_tubes: &mut Vec<Tube>,
2329 vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
2330 control_tubes: &mut Vec<TaggedControlTube>,
2331 hp_control_tube: &mpsc::Sender<PciRootCommand>,
2332 iommu_host_tube: Option<&Tube>,
2333 device: &HotPlugDeviceInfo,
2334 #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
2335 ) -> Result<()> {
2336 let host_addr = PciAddress::from_path(&device.path)
2337 .context("failed to parse hotplug device's PCI address")?;
2338 let hp_bus = get_hp_bus(linux, host_addr)?;
2339
2340 let (hotplug_key, pci_address) = match device.device_type {
2341 HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
2342 let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
2343 control_tubes.push(TaggedControlTube::Vm(vm_host_tube));
2344 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
2345 irq_control_tubes.push(msi_host_tube);
2346 let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
2347 let (hotplug_key, pci_bridge) = match device.device_type {
2348 HotPlugDeviceType::UpstreamPort => {
2349 let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
2350 let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
2351 pcie_host, true,
2352 )?));
2353 let pci_bridge =
2354 Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
2355 linux
2356 .hotplug_bus
2357 .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
2358 (hotplug_key, pci_bridge)
2359 }
2360 HotPlugDeviceType::DownstreamPort => {
2361 let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
2362 let pcie_downstream_port = Arc::new(Mutex::new(
2363 PcieDownstreamPort::new_from_host(pcie_host, true)?,
2364 ));
2365 let pci_bridge = Box::new(PciBridge::new(
2366 pcie_downstream_port.clone(),
2367 msi_device_tube,
2368 ));
2369 linux
2370 .hotplug_bus
2371 .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
2372 (hotplug_key, pci_bridge)
2373 }
2374 _ => {
2375 bail!("Impossible to reach here")
2376 }
2377 };
2378 let pci_address = Arch::register_pci_device(
2379 linux,
2380 pci_bridge,
2381 None,
2382 sys_allocator,
2383 hp_control_tube,
2384 #[cfg(feature = "swap")]
2385 swap_controller,
2386 )?;
2387
2388 (hotplug_key, pci_address)
2389 }
2390 HotPlugDeviceType::EndPoint => {
2391 let hotplug_key = HotPlugKey::HostVfio { host_addr };
2392 let (vfio_device, jail, viommu_mapper) = create_vfio_device(
2393 &cfg.jail_config,
2394 &linux.vm,
2395 sys_allocator,
2396 irq_control_tubes,
2397 vm_memory_control_tubes,
2398 control_tubes,
2399 &device.path,
2400 true,
2401 None,
2402 None,
2403 None,
2404 if iommu_host_tube.is_some() {
2405 IommuDevType::VirtioIommu
2406 } else {
2407 IommuDevType::NoIommu
2408 },
2409 None,
2410 )?;
2411 let vfio_pci_device = match vfio_device {
2412 VfioDeviceVariant::Pci(pci) => Box::new(pci),
2413 VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
2414 };
2415 let pci_address = Arch::register_pci_device(
2416 linux,
2417 vfio_pci_device,
2418 jail,
2419 sys_allocator,
2420 hp_control_tube,
2421 #[cfg(feature = "swap")]
2422 swap_controller,
2423 )?;
2424 if let Some(iommu_host_tube) = iommu_host_tube {
2425 let endpoint_addr = pci_address.to_u32();
2426 let vfio_wrapper = viommu_mapper.context("expected mapper")?;
2427 let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
2428 let request =
2429 VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
2430 endpoint_addr,
2431 wrapper_id: vfio_wrapper.id(),
2432 container: {
2433 // SAFETY:
2434 // Safe because the descriptor is uniquely owned by `descriptor`.
2435 unsafe { File::from_raw_descriptor(descriptor) }
2436 },
2437 });
2438 match virtio_iommu_request(iommu_host_tube, &request)
2439 .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
2440 {
2441 VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
2442 resp => bail!("Unexpected message response: {:?}", resp),
2443 }
2444 }
2445
2446 (hotplug_key, pci_address)
2447 }
2448 };
2449 hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
2450 if device.hp_interrupt {
2451 hp_bus.lock().hot_plug(pci_address)?;
2452 }
2453 Ok(())
2454 }
2455
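/// Hotplugs a virtio-net device described by `net_param`, returning the PCI bus number it was
/// placed on.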
2456 #[cfg(feature = "pci-hotplug")]
2457 fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
2458 linux: &mut RunnableLinuxVm<V, Vcpu>,
2459 sys_allocator: &mut SystemAllocator,
2460 irq_control_tubes: &mut Vec<Tube>,
2461 vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
2462 vm_control_tubes: &mut Vec<TaggedControlTube>,
2463 hotplug_manager: &mut PciHotPlugManager,
2464 net_param: NetParameters,
2465 ) -> Result<u8> {
2466 let (msi_host_tube, msi_device_tube) = Tube::pair().context("create tube")?;
2467 irq_control_tubes.push(msi_host_tube);
2468 let (ioevent_host_tube, ioevent_device_tube) = Tube::pair().context("create tube")?;
2469 let ioevent_vm_memory_client = VmMemoryClient::new(ioevent_device_tube);
2470 vm_memory_control_tubes.push(VmMemoryTube {
2471 tube: ioevent_host_tube,
2472 expose_with_viommu: false,
2473 });
2474 let (vm_control_host_tube, vm_control_device_tube) = Tube::pair().context("create tube")?;
2475 vm_control_tubes.push(TaggedControlTube::Vm(vm_control_host_tube));
2476 let net_carrier_device = NetResourceCarrier::new(
2477 net_param,
2478 msi_device_tube,
2479 ioevent_vm_memory_client,
2480 vm_control_device_tube,
2481 );
2482 hotplug_manager.hotplug_device(
2483 vec![ResourceCarrier::VirtioNet(net_carrier_device)],
2484 linux,
2485 sys_allocator,
2486 )
2487 }
2488
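/// Dispatches a `NetControlCommand` to either add a tap-backed virtio-net device or remove a
/// previously hotplugged one.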
2489 #[cfg(feature = "pci-hotplug")]
2490 fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
2491 net_cmd: NetControlCommand,
2492 linux: &mut RunnableLinuxVm<V, Vcpu>,
2493 sys_allocator: &mut SystemAllocator,
2494 irq_control_tubes: &mut Vec<Tube>,
2495 vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
2496 vm_control_tubes: &mut Vec<TaggedControlTube>,
2497 hotplug_manager: &mut PciHotPlugManager,
2498 ) -> VmResponse {
2499 match net_cmd {
2500 NetControlCommand::AddTap(tap_name) => handle_hotplug_net_add(
2501 linux,
2502 sys_allocator,
2503 irq_control_tubes,
2504 vm_memory_control_tubes,
2505 vm_control_tubes,
2506 hotplug_manager,
2507 &tap_name,
2508 ),
2509 NetControlCommand::RemoveTap(bus) => {
2510 handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus)
2511 }
2512 }
2513 }
2514
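/// Handles an "add tap" hotplug request by building `NetParameters` for the named tap device and
/// delegating to `add_hotplug_net`.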
2515 #[cfg(feature = "pci-hotplug")]
2516 fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
2517 linux: &mut RunnableLinuxVm<V, Vcpu>,
2518 sys_allocator: &mut SystemAllocator,
2519 irq_control_tubes: &mut Vec<Tube>,
2520 vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
2521 vm_control_tubes: &mut Vec<TaggedControlTube>,
2522 hotplug_manager: &mut PciHotPlugManager,
2523 tap_name: &str,
2524 ) -> VmResponse {
2525 let net_param_mode = NetParametersMode::TapName {
2526 tap_name: tap_name.to_owned(),
2527 mac: None,
2528 };
2529 let net_param = NetParameters {
2530 mode: net_param_mode,
2531 vhost_net: None,
2532 vq_pairs: None,
2533 packed_queue: false,
2534 pci_address: None,
2535 };
2536 let ret = add_hotplug_net(
2537 linux,
2538 sys_allocator,
2539 irq_control_tubes,
2540 vm_memory_control_tubes,
2541 vm_control_tubes,
2542 hotplug_manager,
2543 net_param,
2544 );
2545
2546 match ret {
2547 Ok(pci_bus) => VmResponse::PciHotPlugResponse { bus: pci_bus },
2548 Err(e) => VmResponse::ErrString(format!("{:?}", e)),
2549 }
2550 }
2551
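/// Handles a "remove tap" hotplug request by removing the hotplugged device on `bus`.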
2552 #[cfg(feature = "pci-hotplug")]
2553 fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
2554 linux: &mut RunnableLinuxVm<V, Vcpu>,
2555 sys_allocator: &mut SystemAllocator,
2556 hotplug_manager: &mut PciHotPlugManager,
2557 bus: u8,
2558 ) -> VmResponse {
2559 match hotplug_manager.remove_hotplug_device(bus, linux, sys_allocator) {
2560 Ok(_) => VmResponse::Ok,
2561 Err(e) => VmResponse::ErrString(format!("{:?}", e)),
2562 }
2563 }
2564
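/// Hot-unplugs the bridge identified by `hotplug_key`, releasing its PCI resources and
/// recursively removing parent bridges that become empty. The child buses that should be removed
/// are collected in `buses_to_remove`.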
2565 #[cfg(target_arch = "x86_64")]
2566 fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
2567 linux: &RunnableLinuxVm<V, Vcpu>,
2568 sys_allocator: &mut SystemAllocator,
2569 buses_to_remove: &mut Vec<u8>,
2570 hotplug_key: HotPlugKey,
2571 child_bus: u8,
2572 ) -> Result<()> {
2573 for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
2574 let mut hp_bus_lock = hp_bus.lock();
2575 if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2576 sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
2577 hp_bus_lock.hot_unplug(pci_addr)?;
2578 buses_to_remove.push(child_bus);
2579 if hp_bus_lock.is_empty() {
2580 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2581 remove_hotplug_bridge(
2582 linux,
2583 sys_allocator,
2584 buses_to_remove,
2585 hotplug_key,
2586 *bus_num,
2587 )?;
2588 }
2589 }
2590 return Ok(());
2591 }
2592 }
2593
2594 Err(anyhow!(
2595 "Can not find device {:?} on hotplug buses",
2596 hotplug_key
2597 ))
2598 }
2599
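/// Removes a previously hotplugged host device from the guest, detaching it from the
/// virtio-iommu if one is in use and tearing down any hotplug bridges that are left empty.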
2600 #[cfg(target_arch = "x86_64")]
2601 fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
2602 linux: &mut RunnableLinuxVm<V, Vcpu>,
2603 sys_allocator: &mut SystemAllocator,
2604 iommu_host_tube: Option<&Tube>,
2605 device: &HotPlugDeviceInfo,
2606 ) -> Result<()> {
2607 let host_addr = PciAddress::from_path(&device.path)?;
2608 let hotplug_key = match device.device_type {
2609 HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
2610 HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
2611 HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
2612 };
2613
2614 let hp_bus = linux
2615 .hotplug_bus
2616 .iter()
2617 .find(|(_, hp_bus)| {
2618 let hp_bus = hp_bus.lock();
2619 hp_bus.get_hotplug_device(hotplug_key).is_some()
2620 })
2621 .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));
2622
2623 if let Some((bus_num, hp_bus)) = hp_bus {
2624 let mut buses_to_remove = Vec::new();
2625 let mut removed_key = None;
2626 let mut hp_bus_lock = hp_bus.lock();
2627 if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2628 if let Some(iommu_host_tube) = iommu_host_tube {
2629 let request =
2630 VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
2631 endpoint_addr: pci_addr.to_u32(),
2632 });
2633 match virtio_iommu_request(iommu_host_tube, &request)
2634 .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
2635 {
2636 VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
2637 resp => bail!("Unexpected message response: {:?}", resp),
2638 }
2639 }
2640 let mut empty_simbling = true;
2641 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
2642 hp_bus_lock.get_hotplug_key()
2643 {
2644 let addr_alias = host_addr;
2645 for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
2646 if *simbling_bus_num != bus_num {
2647 let hp_bus_lock = hp_bus.lock();
2648 let hotplug_key = hp_bus_lock.get_hotplug_key();
2649 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
2650 if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
2651 empty_simbling = false;
2652 break;
2653 }
2654 }
2655 }
2656 }
2657 }
2658
2659             // If all sibling downstream ports are empty, do not send a hot unplug event for
2660             // this downstream port. The root port will send one plug-out interrupt and remove
2661             // all the remaining devices.
2662 if !empty_simbling {
2663 hp_bus_lock.hot_unplug(pci_addr)?;
2664 }
2665
2666 sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
2667 if empty_simbling || hp_bus_lock.is_empty() {
2668 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2669 removed_key = Some(hotplug_key);
2670 remove_hotplug_bridge(
2671 linux,
2672 sys_allocator,
2673 &mut buses_to_remove,
2674 hotplug_key,
2675 bus_num,
2676 )?;
2677 }
2678 }
2679 }
2680
2681         // Some types of TBT devices have a few empty downstream ports. The emulated bridges
2682         // for these ports won't be removed, since no vfio device is connected to our emulated
2683         // bridges. So we explicitly check all sibling bridges of the removed bridge here and
2684         // remove them if the bridge has no child device connected.
2685 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
2686 let addr_alias = host_addr;
2687 for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
2688 if *simbling_bus_num != bus_num {
2689 let hp_bus_lock = hp_bus.lock();
2690 let hotplug_key = hp_bus_lock.get_hotplug_key();
2691 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
2692 if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
2693 remove_hotplug_bridge(
2694 linux,
2695 sys_allocator,
2696 &mut buses_to_remove,
2697 hotplug_key.unwrap(),
2698 *simbling_bus_num,
2699 )?;
2700 }
2701 }
2702 }
2703 }
2704 }
2705 for bus in buses_to_remove.iter() {
2706 linux.hotplug_bus.remove(bus);
2707 }
2708 return Ok(());
2709 }
2710
2711 Err(anyhow!(
2712 "Can not find device {:?} on hotplug buses",
2713 hotplug_key
2714 ))
2715 }
2716
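/// Emulates a sleep button press, waits up to 15 seconds for the guest to report that it has
/// suspended, then signals the suspend event and sends `response` back over `tube`.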
2717 pub fn trigger_vm_suspend_and_wait_for_entry(
2718 guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
2719 tube: &SendTube,
2720 response: vm_control::VmResponse,
2721 suspend_evt: Event,
2722 pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
2723 ) {
2724 let (lock, cvar) = &*guest_suspended_cvar;
2725 let mut guest_suspended = lock.lock();
2726
2727 *guest_suspended = false;
2728
2729     // During suspend, also emulate a sleep button press, which lets the guest suspend itself
2730     // (if it is running e.g. acpid and reacts to sleep button events).
2731 if let Some(pm) = pm {
2732 pm.lock().slpbtn_evt();
2733 } else {
2734 error!("generating sleepbtn during suspend not supported");
2735 }
2736
2737     // Wait for a notification about guest suspension; if it is not received within 15 seconds,
2738     // proceed anyway.
2739 let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
2740 guest_suspended = result.0;
2741
2742 if result.1.timed_out() {
2743 warn!("Guest suspension timeout - proceeding anyway");
2744 } else if *guest_suspended {
2745 info!("Guest suspended");
2746 }
2747
2748 if let Err(e) = suspend_evt.signal() {
2749 error!("failed to trigger suspend event: {}", e);
2750 }
2751     // Now we are ready to send the response over the tube and report that VM suspend has finished.
2752 if let Err(e) = tube.send(&response) {
2753 error!("failed to send VmResponse: {}", e);
2754 }
2755 }
2756
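/// Sends `command` to the pvclock device over `tube` and validates the response, logging a
/// warning if the device is inactive.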
2757 #[cfg(feature = "pvclock")]
2758 fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<()> {
2759 tube.send(&command)
2760 .with_context(|| format!("failed to send pvclock command {:?}", command))?;
2761 let resp = tube
2762 .recv::<PvClockCommandResponse>()
2763 .context("failed to receive pvclock command response")?;
2764 if let PvClockCommandResponse::Err(e) = resp {
2765 bail!("pvclock encountered error on {:?}: {}", command, e);
2766 }
2767 if let PvClockCommandResponse::DeviceInactive = resp {
2768 warn!("Tried to send {command:?} but pvclock device was inactive");
2769 } else {
2770 info!("{command:?} completed with {resp:?}");
2771 }
2772 Ok(())
2773 }
2774
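/// Handles a VFIO hotplug request from the control socket, adding or removing the device and
/// translating the result into a `VmResponse`.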
2775 #[cfg(target_arch = "x86_64")]
2776 fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
2777 linux: &mut RunnableLinuxVm<V, Vcpu>,
2778 sys_allocator: &mut SystemAllocator,
2779 cfg: &Config,
2780 add_irq_control_tubes: &mut Vec<Tube>,
2781 add_vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
2782 add_tubes: &mut Vec<TaggedControlTube>,
2783 hp_control_tube: &mpsc::Sender<PciRootCommand>,
2784 iommu_host_tube: Option<&Tube>,
2785 device: &HotPlugDeviceInfo,
2786 add: bool,
2787 #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
2788 ) -> VmResponse {
2789 let iommu_host_tube = if cfg.vfio_isolate_hotplug {
2790 iommu_host_tube
2791 } else {
2792 None
2793 };
2794
2795 let ret = if add {
2796 add_hotplug_device(
2797 linux,
2798 sys_allocator,
2799 cfg,
2800 add_irq_control_tubes,
2801 add_vm_memory_control_tubes,
2802 add_tubes,
2803 hp_control_tube,
2804 iommu_host_tube,
2805 device,
2806 #[cfg(feature = "swap")]
2807 swap_controller,
2808 )
2809 } else {
2810 remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
2811 };
2812
2813 match ret {
2814 Ok(()) => VmResponse::Ok,
2815 Err(e) => {
2816 error!("hanlde_hotplug_command failure: {}", e);
2817 add_tubes.clear();
2818 VmResponse::Err(base::Error::new(libc::EINVAL))
2819 }
2820 }
2821 }
2822
2823 struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
2824 linux: &'a mut RunnableLinuxVm<V, Vcpu>,
2825 cfg: &'a Config,
2826 sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
2827 control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
2828 disk_host_tubes: &'a [Tube],
2829 #[cfg(feature = "gpu")]
2830 gpu_control_tube: &'a Tube,
2831 #[cfg(feature = "usb")]
2832 usb_control_tube: &'a Tube,
2833 #[cfg(target_arch = "x86_64")]
2834 iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
2835 #[cfg(target_arch = "x86_64")]
2836 hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
2837 guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
2838 #[cfg(feature = "pci-hotplug")]
2839 hotplug_manager: &'a mut Option<PciHotPlugManager>,
2840 #[cfg(feature = "swap")]
2841 swap_controller: &'a mut Option<SwapController>,
2842 vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
2843 #[cfg(feature = "balloon")]
2844 balloon_tube: Option<&'a mut BalloonTube>,
2845 device_ctrl_tube: &'a Tube,
2846 irq_handler_control: &'a Tube,
2847 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
2848 vm_memory_handler_control: &'a Tube,
2849 #[cfg(feature = "registered_events")]
2850 registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
2851 #[cfg(feature = "pvclock")]
2852 pvclock_host_tube: Option<Arc<Tube>>,
2853 }
2854
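/// Processes a single `VmRequest` received on a control tube. Returns the response to send back
/// (if any), whether a suspend was requested (in which case the response is sent later by the
/// s2idle_wait thread), and an optional new run mode for the vCPUs.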
2855 fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
2856 state: &mut ControlLoopState<V, Vcpu>,
2857 id: usize,
2858 tube: &Tube,
2859 request: VmRequest,
2860 #[cfg_attr(
2861 not(any(target_arch = "x86_64", feature = "pci-hotplug")),
2862 allow(unused_variables, clippy::ptr_arg)
2863 )]
2864 add_tubes: &mut Vec<TaggedControlTube>,
2865 ) -> Result<(Option<VmResponse>, bool, Option<VmRunMode>)> {
2866 let mut suspend_requested = false;
2867 let mut run_mode_opt = None;
2868
2869 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
2870 let mut add_irq_control_tubes = Vec::new();
2871 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
2872 let mut add_vm_memory_control_tubes = Vec::new();
2873
2874 let response = match request {
2875 VmRequest::HotPlugVfioCommand { device, add } => {
2876 #[cfg(target_arch = "x86_64")]
2877 {
2878 handle_hotplug_command(
2879 state.linux,
2880 &mut state.sys_allocator.lock(),
2881 state.cfg,
2882 &mut add_irq_control_tubes,
2883 &mut add_vm_memory_control_tubes,
2884 add_tubes,
2885 state.hp_control_tube,
2886 state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
2887 &device,
2888 add,
2889 #[cfg(feature = "swap")]
2890 state.swap_controller,
2891 )
2892 }
2893
2894 #[cfg(not(target_arch = "x86_64"))]
2895 {
2896 // Suppress warnings.
2897 let _ = (device, add);
2898 VmResponse::Ok
2899 }
2900 }
2901 #[cfg(feature = "pci-hotplug")]
2902 VmRequest::HotPlugNetCommand(net_cmd) => {
2903 if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
2904 handle_hotplug_net_command(
2905 net_cmd,
2906 state.linux,
2907 &mut state.sys_allocator.lock(),
2908 &mut add_irq_control_tubes,
2909 &mut add_vm_memory_control_tubes,
2910 add_tubes,
2911 hotplug_manager,
2912 )
2913 } else {
2914 VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
2915 }
2916 }
2917 #[cfg(feature = "registered_events")]
2918 VmRequest::RegisterListener { socket_addr, event } => {
2919 let (registered_tube, already_registered) =
2920 find_registered_tube(state.registered_evt_tubes, &socket_addr, event);
2921
2922 if !already_registered {
2923 let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;
2924
2925 if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
2926 tubes.insert(addr_tube);
2927 } else {
2928 state
2929 .registered_evt_tubes
2930 .insert(event, vec![addr_tube].into_iter().collect());
2931 }
2932 }
2933 VmResponse::Ok
2934 }
2935 #[cfg(feature = "registered_events")]
2936 VmRequest::UnregisterListener { socket_addr, event } => {
2937 if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
2938 tubes.retain(|t| t.socket_addr != socket_addr);
2939 }
2940 state
2941 .registered_evt_tubes
2942 .retain(|_, tubes| !tubes.is_empty());
2943 VmResponse::Ok
2944 }
2945 #[cfg(feature = "registered_events")]
2946 VmRequest::Unregister { socket_addr } => {
2947 for (_, tubes) in state.registered_evt_tubes.iter_mut() {
2948 tubes.retain(|t| t.socket_addr != socket_addr);
2949 }
2950 state
2951 .registered_evt_tubes
2952 .retain(|_, tubes| !tubes.is_empty());
2953 VmResponse::Ok
2954 }
2955 #[cfg(feature = "balloon")]
2956 VmRequest::BalloonCommand(cmd) => {
2957 if let Some(tube) = state.balloon_tube.as_mut() {
2958 let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
2959 return Ok((None, false, None));
2960 };
2961 if key != id {
2962 let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
2963 return Ok((None, false, None));
2964 };
2965 if let Err(e) = tube.send(&r) {
2966 error!("failed to send VmResponse: {}", e);
2967 }
2968 return Ok((None, false, None));
2969 }
2970 r
2971 } else {
2972 VmResponse::Err(base::Error::new(libc::ENOTSUP))
2973 }
2974 }
2975 _ => {
2976 let response = request.execute(
2977 &state.linux.vm,
2978 &mut run_mode_opt,
2979 state.disk_host_tubes,
2980 &mut state.linux.pm,
2981 #[cfg(feature = "gpu")]
2982 Some(state.gpu_control_tube),
2983 #[cfg(not(feature = "gpu"))]
2984 None,
2985 #[cfg(feature = "usb")]
2986 Some(state.usb_control_tube),
2987 #[cfg(not(feature = "usb"))]
2988 None,
2989 &mut state.linux.bat_control,
2990 |msg| {
2991 vcpu::kick_all_vcpus(
2992 state.vcpu_handles,
2993 state.linux.irq_chip.as_irq_chip(),
2994 msg,
2995 )
2996 },
2997 state.cfg.force_s2idle,
2998 #[cfg(feature = "swap")]
2999 state.swap_controller.as_ref(),
3000 state.device_ctrl_tube,
3001 state.vcpu_handles.len(),
3002 state.irq_handler_control,
3003 || state.linux.irq_chip.snapshot(state.linux.vcpu_count),
3004 );
3005 if state.cfg.force_s2idle {
3006 if let VmRequest::SuspendVcpus = request {
3007 suspend_requested = true;
3008
3009 // Spawn s2idle wait thread.
3010 let send_tube = tube.try_clone_send_tube().unwrap();
3011 let suspend_evt = state.linux.suspend_evt.try_clone().unwrap();
3012 let guest_suspended_cvar = state.guest_suspended_cvar.clone();
3013 let delayed_response = response.clone();
3014 let pm = state.linux.pm.clone();
3015
3016 std::thread::Builder::new()
3017 .name("s2idle_wait".to_owned())
3018 .spawn(move || {
3019 trigger_vm_suspend_and_wait_for_entry(
3020 guest_suspended_cvar.unwrap(),
3021 &send_tube,
3022 delayed_response,
3023 suspend_evt,
3024 pm,
3025 )
3026 })
3027 .context("failed to spawn s2idle_wait thread")?;
3028 }
3029 } else {
3030                 // If not doing s2idle, the guest clock should behave as the host clock does,
3031                 // so let the guest know about the suspend / resume via virtio-pvclock.
3034 #[cfg(feature = "pvclock")]
3035 if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
3036 let cmd = match request {
3037 VmRequest::SuspendVcpus => Some(PvClockCommand::Suspend),
3038 VmRequest::ResumeVcpus => Some(PvClockCommand::Resume),
3039 _ => None,
3040 };
3041 if let Some(cmd) = cmd {
3042 if let Err(e) = send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
3043 error!("{:?} command failed: {:#}", cmd, e);
3044 } else {
3045 info!("{:?} command successfully processed", cmd);
3046 }
3047 }
3048 }
3049 }
3050 response
3051 }
3052 };
3053
3054 cfg_if::cfg_if! {
3055 if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
3056 if !add_irq_control_tubes.is_empty() {
3057 state
3058 .irq_handler_control
3059 .send(&IrqHandlerRequest::AddIrqControlTubes(
3060 add_irq_control_tubes,
3061 ))?;
3062 }
3063 if !add_vm_memory_control_tubes.is_empty() {
3064 state
3065 .vm_memory_handler_control
3066 .send(&VmMemoryHandlerRequest::AddControlTubes(
3067 add_vm_memory_control_tubes,
3068 ))?;
3069 }
3070 }
3071 }
3072
3073 Ok((Some(response), suspend_requested, run_mode_opt))
3074 }
3075
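/// Handles a readable event on one of the tagged control tubes. Returns whether the VM should
/// exit, the ids of any disconnected tubes to remove, and any new control tubes to register.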
3076 fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3077 state: &mut ControlLoopState<V, Vcpu>,
3078 id: usize,
3079 socket: &TaggedControlTube,
3080 ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
3081 let mut vm_control_ids_to_remove = Vec::new();
3082 let mut add_tubes = Vec::new();
3083 match socket {
3084 TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3085 Ok(request) => {
3086 let (response, suspend_requested, run_mode_opt) =
3087 process_vm_request(state, id, tube, request, &mut add_tubes)?;
3088
3089 if let Some(response) = response {
3090                     // If suspend was requested, skip this step since it will be performed by
3091                     // the s2idle_wait thread when suspension actually happens.
3093 if !suspend_requested {
3094 if let Err(e) = tube.send(&response) {
3095 error!("failed to send VmResponse: {}", e);
3096 }
3097 }
3098 }
3099
3100 if let Some(run_mode) = run_mode_opt {
3101 info!("control socket changed run mode to {}", run_mode);
3102 match run_mode {
3103 VmRunMode::Exiting => {
3104 return Ok((true, Vec::new(), Vec::new()));
3105 }
3106 other => {
3107 if other == VmRunMode::Running {
3108 for dev in &state.linux.resume_notify_devices {
3109 dev.lock().resume_imminent();
3110 }
3111 }
3112                         // If suspend was requested, skip this step since
3113                         // it will be performed by the s2idle_wait thread
3114                         // when needed.
3115 if !suspend_requested {
3116 vcpu::kick_all_vcpus(
3117 state.vcpu_handles,
3118 state.linux.irq_chip.as_irq_chip(),
3119 VcpuControl::RunState(other),
3120 );
3121 }
3122 }
3123 }
3124 }
3125 }
3126 Err(e) => {
3127 if let TubeError::Disconnected = e {
3128 vm_control_ids_to_remove.push(id);
3129 } else {
3130 error!("failed to recv VmRequest: {}", e);
3131 }
3132 }
3133 },
3134 TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMsyncRequest>() {
3135 Ok(request) => {
3136 let response = request.execute(&mut state.linux.vm);
3137 if let Err(e) = tube.send(&response) {
3138 error!("failed to send VmMsyncResponse: {}", e);
3139 }
3140 }
3141 Err(e) => {
3142 if let TubeError::Disconnected = e {
3143 vm_control_ids_to_remove.push(id);
3144 } else {
3145 error!("failed to recv VmMsyncRequest: {}", e);
3146 }
3147 }
3148 },
3149 TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3150 Ok(request) => {
3151 let response =
3152 request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
3153 if let Err(e) = tube.send(&response) {
3154 error!("failed to send VmResponse: {}", e);
3155 }
3156 }
3157 Err(e) => {
3158 if let TubeError::Disconnected = e {
3159 vm_control_ids_to_remove.push(id);
3160 } else {
3161 error!("failed to recv VmResponse: {}", e);
3162 }
3163 }
3164 },
3165 }
3166
3167 Ok((false, vm_control_ids_to_remove, add_tubes))
3168 }
3169
3170 #[cfg(feature = "registered_events")]
3171 struct AddressedProtoTube {
3172 tube: Rc<ProtoTube>,
3173 socket_addr: String,
3174 }
3175
3176 #[cfg(feature = "registered_events")]
3177 impl PartialEq for AddressedProtoTube {
3178     fn eq(&self, other: &Self) -> bool {
3179 self.socket_addr == other.socket_addr
3180 }
3181 }
3182
3183 #[cfg(feature = "registered_events")]
3184 impl Eq for AddressedProtoTube {}
3185
3186 #[cfg(feature = "registered_events")]
3187 impl Hash for AddressedProtoTube {
3188     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
3189 self.socket_addr.hash(state);
3190 }
3191 }
3192
3193 #[cfg(feature = "registered_events")]
3194 impl AddressedProtoTube {
3195     pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
3196 self.tube.send_proto(msg)
3197 }
3198 }
3199
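/// Looks up an already-connected `ProtoTube` for `socket_addr` and reports whether this
/// (socket, event) pair is already registered.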
3200 #[cfg(feature = "registered_events")]
3201 fn find_registered_tube<'a>(
3202 registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
3203 socket_addr: &str,
3204 event: RegisteredEvent,
3205 ) -> (Option<&'a Rc<ProtoTube>>, bool) {
3206 let mut registered_tube: Option<&Rc<ProtoTube>> = None;
3207 let mut already_registered = false;
3208 'outer: for (evt, addr_tubes) in registered_tubes {
3209 for addr_tube in addr_tubes {
3210 if addr_tube.socket_addr == socket_addr {
3211 if *evt == event {
3212 already_registered = true;
3213 break 'outer;
3214 }
3215                 // Since all tubes with the same addr should be an Rc to the same tube, it
3216                 // doesn't matter which one we get. But we do need to check for a registration
3217                 // for the current event, so we can't break here.
3220 registered_tube = Some(&addr_tube.tube);
3221 }
3222 }
3223 }
3224 (registered_tube, already_registered)
3225 }
3226
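/// Reuses an existing `ProtoTube` for `addr` if one was found; otherwise connects a new
/// seqpacket socket to the listener at `addr`.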
3227 #[cfg(feature = "registered_events")]
3228 fn make_addr_tube_from_maybe_existing(
3229 tube: Option<&Rc<ProtoTube>>,
3230 addr: String,
3231 ) -> Result<AddressedProtoTube> {
3232 if let Some(registered_tube) = tube {
3233 Ok(AddressedProtoTube {
3234 tube: registered_tube.clone(),
3235 socket_addr: addr,
3236 })
3237 } else {
3238 let sock = UnixSeqpacket::connect(addr.clone()).with_context(|| {
3239 format!("failed to connect to registered listening socket {}", addr)
3240 })?;
3241 let tube = ProtoTube::new_from_unix_seqpacket(sock)?;
3242 Ok(AddressedProtoTube {
3243 tube: Rc::new(tube),
3244 socket_addr: addr,
3245 })
3246 }
3247 }
3248
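/// The main VM control loop: waits on control sockets, VM events, and child signals, dispatching
/// requests until the VM exits, and returns the final `ExitState`.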
3249 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3250 mut linux: RunnableLinuxVm<V, Vcpu>,
3251 sys_allocator: SystemAllocator,
3252 cfg: Config,
3253 control_server_socket: Option<UnlinkUnixSeqpacketListener>,
3254 irq_control_tubes: Vec<Tube>,
3255 vm_memory_control_tubes: Vec<VmMemoryTube>,
3256 control_tubes: Vec<TaggedControlTube>,
3257 #[cfg(feature = "balloon")] balloon_host_tube: Option<Tube>,
3258 disk_host_tubes: &[Tube],
3259 #[cfg(feature = "gpu")] gpu_control_tube: Tube,
3260 #[cfg(feature = "usb")] usb_control_tube: Tube,
3261 vm_evt_rdtube: RecvTube,
3262 vm_evt_wrtube: SendTube,
3263 sigchld_fd: SignalFd,
3264 gralloc: RutabagaGralloc,
3265 vcpu_ids: Vec<usize>,
3266 iommu_host_tube: Option<Tube>,
3267 #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
3268 #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
3269 #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
3270     #[allow(unused_mut)] // mut is required on x86_64 only
3271 #[cfg(feature = "swap")]
3272 mut swap_controller: Option<SwapController>,
3273 #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
3274 guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
3275 #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
3276 metrics_tube: RecvTube,
3277 ) -> Result<ExitState> {
3278 #[derive(EventToken)]
3279 enum Token {
3280 VmEvent,
3281 Suspend,
3282 ChildSignal,
3283 VmControlServer,
3284 VmControl {
3285 id: usize,
3286 },
3287 #[cfg(feature = "registered_events")]
3288 RegisteredEvent,
3289 #[cfg(feature = "balloon")]
3290 BalloonTube,
3291 }
3292 stdin()
3293 .set_raw_mode()
3294 .expect("failed to set terminal raw mode");
3295
3296 let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
3297 let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
3298
3299 let wait_ctx = WaitContext::build_with(&[
3300 (&linux.suspend_evt, Token::Suspend),
3301 (&sigchld_fd, Token::ChildSignal),
3302 (&vm_evt_rdtube, Token::VmEvent),
3303 #[cfg(feature = "registered_events")]
3304         (&reg_evt_rdtube, Token::RegisteredEvent),
3305 ])
3306 .context("failed to build wait context")?;
3307
3308 if let Some(socket_server) = &control_server_socket {
3309 wait_ctx
3310 .add(socket_server, Token::VmControlServer)
3311 .context("failed to add descriptor to wait context")?;
3312 }
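// Give every control tube a stable id that doubles as its `VmControl` wait token; tubes accepted
// later (via the control server or hotplug) continue the sequence from `next_control_id`.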
3313 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
3314 let mut next_control_id = control_tubes.len();
3315 for (id, socket) in control_tubes.iter() {
3316 wait_ctx
3317 .add(socket.as_ref(), Token::VmControl { id: *id })
3318 .context("failed to add descriptor to wait context")?;
3319 }
3320
3321 #[cfg(feature = "balloon")]
3322 let mut balloon_tube = balloon_host_tube
3323 .map(|tube| -> Result<BalloonTube> {
3324 wait_ctx
3325 .add(&tube, Token::BalloonTube)
3326 .context("failed to add descriptor to wait context")?;
3327 Ok(BalloonTube::new(tube))
3328 })
3329 .transpose()
3330 .context("failed to create balloon tube")?;
3331
3332 if cfg.jail_config.is_some() {
3333 // Before starting VCPUs, in case we started with some capabilities, drop them all.
3334 drop_capabilities().context("failed to drop process capabilities")?;
3335 }
3336
3337 #[cfg(feature = "gdb")]
3338 // Create a channel for GDB thread.
3339 let (to_gdb_channel, from_vcpu_channel) = if linux.gdb.is_some() {
3340 let (s, r) = mpsc::channel();
3341 (Some(s), Some(r))
3342 } else {
3343 (None, None)
3344 };
3345
3346 let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
3347 // Create devices thread, and restore if a restore file exists.
3348 linux.devices_thread = match create_devices_worker_thread(
3349 linux.vm.get_memory().clone(),
3350 linux.io_bus.clone(),
3351 linux.mmio_bus.clone(),
3352 device_ctrl_resp,
3353 ) {
3354 Ok(join_handle) => Some(join_handle),
3355 Err(e) => {
3356 return Err(anyhow!("Failed to start devices thread: {}", e));
3357 }
3358 };
3359
3360 let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
3361 let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
3362
3363 if !linux
3364 .vm
3365 .get_hypervisor()
3366 .check_capability(HypervisorCap::ImmediateExit)
3367 {
3368 return Err(anyhow!(
3369 "missing required hypervisor capability ImmediateExit"
3370 ));
3371 }
3372
3373 vcpu::setup_vcpu_signal_handler()?;
3374
3375 let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
3376 Some(vec) => vec.into_iter().map(Some).collect(),
3377 None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
3378 };
3379 // Enable core scheduling before creating vCPUs so that the cookie will be
3380 // shared by all vCPU threads.
3381 // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
3382 // itself for even better performance. Only vCPUs need the feature.
3383 if cfg.core_scheduling && cfg.per_vm_core_scheduling {
3384 if let Err(e) = enable_core_scheduling() {
3385 error!("Failed to enable core scheduling: {}", e);
3386 }
3387 }
3388 let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
3389 None => None,
3390 Some(cgroup_path) => {
3391 // Move main process to cgroup_path
3392 let mut f = File::create(&cgroup_path.join("tasks")).with_context(|| {
3393 format!(
3394 "failed to create vcpu-cgroup-path {}",
3395 cgroup_path.display(),
3396 )
3397 })?;
3398 f.write_all(process::id().to_string().as_bytes())?;
3399 Some(f)
3400 }
3401 };
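// On x86_64, set up an optional rate limiter for guest bus-lock events, enabled only when the
// hypervisor supports bus lock detection.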
3402 #[cfg(target_arch = "x86_64")]
3403 let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
3404 #[cfg(target_arch = "x86_64")]
3405 if cfg.bus_lock_ratelimit > 0 {
3406 let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
3407 if linux.vm.check_capability(VmCap::BusLockDetect) {
3408 info!("Hypervisor support bus lock detect");
3409 linux
3410 .vm
3411 .enable_capability(VmCap::BusLockDetect, 0)
3412 .expect("kvm: Failed to enable bus lock detection cap");
3413 info!("Hypervisor enabled bus lock detect");
3414 bus_lock_ratelimit_ctrl
3415 .lock()
3416 .ratelimit_set_speed(bus_lock_ratelimit);
3417 } else {
3418 bail!("Kvm: bus lock detection unsuported");
3419 }
3420 }
3421
3422 #[cfg(target_os = "android")]
3423 android::set_process_profiles(&cfg.task_profiles)?;
3424
3425 #[allow(unused_mut)]
3426 let mut run_mode = if cfg.suspended {
3427 // Sleep devices before creating vcpus.
3428 device_ctrl_tube
3429 .send(&DeviceControlCommand::SleepDevices)
3430 .context("send command to devices control socket")?;
3431 match device_ctrl_tube
3432 .recv()
3433 .context("receive from devices control socket")?
3434 {
3435 VmResponse::Ok => (),
3436 resp => bail!("device sleep failed: {}", resp),
3437 }
3438 VmRunMode::Suspending
3439 } else {
3440 VmRunMode::Running
3441 };
3442 #[cfg(feature = "gdb")]
3443 if to_gdb_channel.is_some() {
3444 // Wait until a GDB client attaches
3445 run_mode = VmRunMode::Breakpoint;
3446 }
3447 // If we are restoring from a snapshot, then start suspended.
3448 let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
3449 (VmRunMode::Suspending, run_mode)
3450 } else {
3451 (run_mode, run_mode)
3452 };
3453
3454 #[cfg(feature = "pvclock")]
3455 let pvclock_host_tube = pvclock_host_tube.map(Arc::new);
3456
3457 // Architecture-specific code must supply a vcpu_init element for each VCPU.
3458 assert_eq!(vcpus.len(), linux.vcpu_init.len());
3459
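// Spawn one thread per vCPU; each gets its own channel for `VcpuControl` messages from the main
// loop.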
3460 for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
3461 {
3462 let (to_vcpu_channel, from_main_channel) = mpsc::channel();
3463 let vcpu_affinity = match linux.vcpu_affinity.clone() {
3464 Some(VcpuAffinity::Global(v)) => v,
3465 Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
3466 None => Default::default(),
3467 };
3468
3469 #[cfg(target_arch = "x86_64")]
3470 let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
3471 Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
3472 } else {
3473 None
3474 };
3475
3476 #[cfg(target_arch = "x86_64")]
3477 let cpu_config = Some(CpuConfigX86_64::new(
3478 cfg.force_calibrated_tsc_leaf,
3479 cfg.host_cpu_topology,
3480 cfg.enable_hwp,
3481 cfg.no_smt,
3482 cfg.itmt,
3483 vcpu_hybrid_type,
3484 ));
3485 #[cfg(target_arch = "x86_64")]
3486 let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
3487
3488 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3489 let cpu_config = None;
3490
3491 #[cfg(target_arch = "riscv64")]
3492 let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
3493
3494 let handle = vcpu::run_vcpu(
3495 cpu_id,
3496 vcpu_ids[cpu_id],
3497 vcpu,
3498 vcpu_init,
3499 linux.vm.try_clone().context("failed to clone vm")?,
3500 linux
3501 .irq_chip
3502 .try_box_clone()
3503 .context("failed to clone irqchip")?,
3504 linux.vcpu_count,
3505 linux.rt_cpus.contains(&cpu_id),
3506 vcpu_affinity,
3507 linux.delay_rt,
3508 vcpu_thread_barrier.clone(),
3509 (*linux.io_bus).clone(),
3510 (*linux.mmio_bus).clone(),
3511 vm_evt_wrtube
3512 .try_clone()
3513 .context("failed to clone vm event tube")?,
3514 from_main_channel,
3515 #[cfg(feature = "gdb")]
3516 to_gdb_channel.clone(),
3517 cfg.core_scheduling,
3518 cfg.per_vm_core_scheduling,
3519 cpu_config,
3520 match vcpu_cgroup_tasks_file {
3521 None => None,
3522 Some(ref f) => Some(
3523 f.try_clone()
3524 .context("failed to clone vcpu cgroup tasks file")?,
3525 ),
3526 },
3527 #[cfg(target_arch = "x86_64")]
3528 bus_lock_ratelimit_ctrl,
3529 run_mode,
3530 cfg.boost_uclamp,
3531 )?;
3532 vcpu_handles.push((handle, to_vcpu_channel));
3533 }
3534
3535 #[cfg(feature = "gdb")]
3536 // Spawn GDB thread.
3537 if let Some((gdb_port_num, gdb_control_tube)) = linux.gdb.take() {
3538 let to_vcpu_channels = vcpu_handles
3539 .iter()
3540 .map(|(_handle, channel)| channel.clone())
3541 .collect();
3542 let target = GdbStub::new(
3543 gdb_control_tube,
3544 to_vcpu_channels,
3545 from_vcpu_channel.unwrap(), // unwrap() always succeeds: the channel was created above because gdb is set.
3546 );
3547 std::thread::Builder::new()
3548 .name("gdb".to_owned())
3549 .spawn(move || gdb_thread(target, gdb_port_num))
3550 .context("failed to spawn GDB thread")?;
3551 };
3552
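// Move IRQ servicing onto a dedicated thread: it owns the device IRQ control tubes and a clone of
// the irqchip, so irqfd events are handled without involving the main control loop.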
3553 let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
3554 let sys_allocator_for_thread = sys_allocator_mutex.clone();
3555 let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
3556 let irq_handler_thread = std::thread::Builder::new()
3557 .name("irq_handler_thread".into())
3558 .spawn(move || {
3559 irq_handler_thread(
3560 irq_control_tubes,
3561 irq_chip_for_thread,
3562 sys_allocator_for_thread,
3563 irq_handler_control_for_thread,
3564 )
3565 })
3566 .unwrap();
3567
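// Likewise, VM memory requests (region registration, gralloc allocations, etc.) are serviced by
// their own handler thread.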
3568 let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
3569 let vm_memory_handler_thread = std::thread::Builder::new()
3570 .name("vm_memory_handler_thread".into())
3571 .spawn({
3572 let vm = linux.vm.try_clone().context("failed to clone Vm")?;
3573 let sys_allocator_mutex = sys_allocator_mutex.clone();
3574 let iommu_client = iommu_host_tube
3575 .as_ref()
3576 .map(|t| VmMemoryRequestIommuClient::new(t.clone()));
3577 move || {
3578 vm_memory_handler_thread(
3579 vm_memory_control_tubes,
3580 vm,
3581 sys_allocator_mutex,
3582 gralloc,
3583 iommu_client,
3584 vm_memory_handler_control_for_thread,
3585 )
3586 }
3587 })
3588 .unwrap();
3589
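// All vCPU threads are parked on this barrier; waiting on it here releases them to run in the
// initial `run_mode`.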
3590 vcpu_thread_barrier.wait();
3591
3592 // Restore VM (if applicable).
3593 // Must happen after the vCPU barrier to avoid deadlock.
3594 if let Some(path) = &cfg.restore_path {
3595 vm_control::do_restore(
3596 path.clone(),
3597 &linux.vm,
3598 |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
3599 |msg, index| {
3600 vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
3601 },
3602 &irq_handler_control,
3603 &device_ctrl_tube,
3604 linux.vcpu_count,
3605 |image| {
3606 linux
3607 .irq_chip
3608 .try_box_clone()?
3609 .restore(image, linux.vcpu_count)
3610 },
3611 /* require_encrypted= */ false,
3612 )?;
3613 // Allow the vCPUs to start for real.
3614 vcpu::kick_all_vcpus(
3615 &vcpu_handles,
3616 linux.irq_chip.as_irq_chip(),
3617 VcpuControl::RunState(post_restore_run_mode),
3618 )
3619 }
3620
3621 #[cfg(feature = "swap")]
3622 if let Some(swap_controller) = &swap_controller {
3623 swap_controller
3624 .on_static_devices_setup_complete()
3625 .context("static device setup complete")?;
3626 }
3627
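// If metrics were initialized, drain `metrics_tube` on a separate thread via the metrics
// controller.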
3628 let metrics_thread = if metrics::is_initialized() {
3629 Some(
3630 std::thread::Builder::new()
3631 .name("metrics_thread".into())
3632 .spawn(move || {
3633 if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
3634 error!("Metrics controller error: {:?}", e);
3635 }
3636 })
3637 .context("metrics thread failed")?,
3638 )
3639 } else {
3640 None
3641 };
3642
3643 let mut exit_state = ExitState::Stop;
3644 let mut pvpanic_code = PvPanicCode::Unknown;
3645 #[cfg(feature = "registered_events")]
3646 let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
3647 HashMap::new();
3648
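// Main event loop: block on the wait context and dispatch every readable token until an exit is
// requested.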
3649 'wait: loop {
3650 let events = {
3651 match wait_ctx.wait() {
3652 Ok(v) => v,
3653 Err(e) => {
3654 error!("failed to poll: {}", e);
3655 break;
3656 }
3657 }
3658 };
3659
3660 let mut vm_control_ids_to_remove = Vec::new();
3661 for event in events.iter().filter(|e| e.is_readable) {
3662 match event.token {
3663 #[cfg(feature = "registered_events")]
3664 Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
3665 Ok(reg_evt) => {
3666 let evt = reg_evt.into_event();
3667 let mut tubes_to_remove: Vec<String> = Vec::new();
3668 if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
3669 for tube in tubes.iter() {
3670 if let Err(e) = tube.send(&reg_evt.into_proto()) {
3671 warn!(
3672 "failed to send registered event {:?} to {}, removing from \
3673 registrations: {}",
3674 reg_evt, tube.socket_addr, e
3675 );
3676 tubes_to_remove.push(tube.socket_addr.clone());
3677 }
3678 }
3679 }
3680 for tube_addr in tubes_to_remove {
3681 for tubes in registered_evt_tubes.values_mut() {
3682 tubes.retain(|t| t.socket_addr != tube_addr);
3683 }
3684 }
3685 registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
3686 }
3687 Err(e) => {
3688 warn!("failed to recv RegisteredEvent: {}", e);
3689 }
3690 },
3691 Token::VmEvent => {
3692 let mut break_to_wait: bool = true;
3693 match vm_evt_rdtube.recv::<VmEventType>() {
3694 Ok(vm_event) => match vm_event {
3695 VmEventType::Exit => {
3696 info!("vcpu requested shutdown");
3697 exit_state = ExitState::Stop;
3698 }
3699 VmEventType::Reset => {
3700 info!("vcpu requested reset");
3701 exit_state = ExitState::Reset;
3702 }
3703 VmEventType::Crash => {
3704 info!("vcpu crashed");
3705 exit_state = ExitState::Crash;
3706 }
3707 VmEventType::Panic(panic_code) => {
3708 pvpanic_code = PvPanicCode::from_u8(panic_code);
3709 info!("Guest reported panic [Code: {}]", pvpanic_code);
3710 break_to_wait = false;
3711 }
3712 VmEventType::WatchdogReset => {
3713 info!("vcpu stall detected");
3714 exit_state = ExitState::WatchdogReset;
3715 }
3716 },
3717 Err(e) => {
3718 warn!("failed to recv VmEvent: {}", e);
3719 }
3720 }
3721 if break_to_wait {
3722 if pvpanic_code == PvPanicCode::Panicked {
3723 exit_state = ExitState::GuestPanic;
3724 }
3725 break 'wait;
3726 }
3727 }
3728 Token::Suspend => {
3729 info!("VM requested suspend");
3730 linux.suspend_evt.wait().unwrap();
3731 vcpu::kick_all_vcpus(
3732 &vcpu_handles,
3733 linux.irq_chip.as_irq_chip(),
3734 VcpuControl::RunState(VmRunMode::Suspending),
3735 );
3736 }
3737 Token::ChildSignal => {
3738 // Print all available siginfo structs, then exit the loop if any child process has
3739 // exited, except for CLD_STOPPED and CLD_CONTINUED; those two should be ignored
3740 // here since they are used by the vmm-swap feature.
3741 let mut do_exit = false;
3742 while let Some(siginfo) =
3743 sigchld_fd.read().context("failed to read signalfd")?
3744 {
3745 let pid = siginfo.ssi_pid;
3746 let pid_label = match linux.pid_debug_label_map.get(&pid) {
3747 Some(label) => format!("{} (pid {})", label, pid),
3748 None => format!("pid {}", pid),
3749 };
3750
3751 // TODO(kawasin): this is a temporary exception until device suspension.
3752 #[cfg(feature = "swap")]
3753 if siginfo.ssi_code == libc::CLD_STOPPED
3754 || siginfo.ssi_code == libc::CLD_CONTINUED
3755 {
3756 continue;
3757 }
3758
3759 // Ignore clean exits of non-tracked child processes when running without
3760 // sandboxing. The virtio gpu process launches a render server for
3761 // pass-through graphics. Host GPU drivers have been observed to fork
3762 // child processes that exit cleanly which should not be considered a
3763 // crash. When running with sandboxing, this should be handled by the
3764 // device's process handler.
3765 if cfg.jail_config.is_none()
3766 && !linux.pid_debug_label_map.contains_key(&pid)
3767 && siginfo.ssi_signo == libc::SIGCHLD as u32
3768 && siginfo.ssi_code == libc::CLD_EXITED
3769 && siginfo.ssi_status == 0
3770 {
3771 continue;
3772 }
3773
3774 error!(
3775 "child {} exited: signo {}, status {}, code {}",
3776 pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
3777 );
3778 do_exit = true;
3779 }
3780 if do_exit {
3781 exit_state = ExitState::Crash;
3782 break 'wait;
3783 }
3784 }
3785 Token::VmControlServer => {
3786 if let Some(socket_server) = &control_server_socket {
3787 match socket_server.accept() {
3788 Ok(socket) => {
3789 let id = next_control_id;
3790 next_control_id += 1;
3791 wait_ctx
3792 .add(&socket, Token::VmControl { id })
3793 .context("failed to add descriptor to wait context")?;
3794 control_tubes.insert(
3795 id,
3796 TaggedControlTube::Vm(Tube::new_from_unix_seqpacket(socket)?),
3797 );
3798 }
3799 Err(e) => error!("failed to accept socket: {}", e),
3800 }
3801 }
3802 }
3803 Token::VmControl { id } => {
3804 if let Some(socket) = control_tubes.get(&id) {
3805 let mut state = ControlLoopState {
3806 linux: &mut linux,
3807 cfg: &cfg,
3808 sys_allocator: &sys_allocator_mutex,
3809 control_tubes: &control_tubes,
3810 disk_host_tubes,
3811 #[cfg(feature = "gpu")]
3812 gpu_control_tube: &gpu_control_tube,
3813 #[cfg(feature = "usb")]
3814 usb_control_tube: &usb_control_tube,
3815 #[cfg(target_arch = "x86_64")]
3816 iommu_host_tube: &iommu_host_tube,
3817 #[cfg(target_arch = "x86_64")]
3818 hp_control_tube: &hp_control_tube,
3819 guest_suspended_cvar: &guest_suspended_cvar,
3820 #[cfg(feature = "pci-hotplug")]
3821 hotplug_manager: &mut hotplug_manager,
3822 #[cfg(feature = "swap")]
3823 swap_controller: &mut swap_controller,
3824 vcpu_handles: &vcpu_handles,
3825 #[cfg(feature = "balloon")]
3826 balloon_tube: balloon_tube.as_mut(),
3827 device_ctrl_tube: &device_ctrl_tube,
3828 irq_handler_control: &irq_handler_control,
3829 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3830 vm_memory_handler_control: &vm_memory_handler_control,
3831 #[cfg(feature = "registered_events")]
3832 registered_evt_tubes: &mut registered_evt_tubes,
3833 #[cfg(feature = "pvclock")]
3834 pvclock_host_tube: pvclock_host_tube.clone(),
3835 };
3836 let (exit_requested, mut ids_to_remove, add_tubes) =
3837 process_vm_control_event(&mut state, id, socket)?;
3838 if exit_requested {
3839 break 'wait;
3840 }
3841 vm_control_ids_to_remove.append(&mut ids_to_remove);
3842 for socket in add_tubes {
3843 let id = next_control_id;
3844 next_control_id += 1;
3845 wait_ctx
3846 .add(socket.as_ref(), Token::VmControl { id })
3847 .context(
3848 "failed to add hotplug vfio-pci descriptor to wait context",
3849 )?;
3850 control_tubes.insert(id, socket);
3851 }
3852 }
3853 }
3854 #[cfg(feature = "balloon")]
3855 Token::BalloonTube => {
3856 match balloon_tube.as_mut().expect("missing balloon tube").recv() {
3857 Ok(resp) => {
3858 for (resp, idx) in resp {
3859 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
3860 if let Err(e) = tube.send(&resp) {
3861 error!("failed to send VmResponse: {}", e);
3862 }
3863 } else {
3864 error!("Bad tube index {}", idx);
3865 }
3866 }
3867 }
3868 Err(err) => {
3869 error!("Error processing balloon tube {:?}", err)
3870 }
3871 }
3872 }
3873 }
3874 }
3875
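// Drop control tubes that disconnected or hung up during this iteration so we do not keep
// polling their stale tokens.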
3876 remove_hungup_and_drained_tubes(
3877 &events,
3878 &wait_ctx,
3879 &mut control_tubes,
3880 vm_control_ids_to_remove,
3881 |token: &Token| {
3882 if let Token::VmControl { id } = token {
3883 return Some(*id);
3884 }
3885 None
3886 },
3887 )?;
3888 }
3889
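// Shutdown path: tell every vCPU to exit and join its thread before tearing down the rest of the
// VM.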
3890 vcpu::kick_all_vcpus(
3891 &vcpu_handles,
3892 linux.irq_chip.as_irq_chip(),
3893 VcpuControl::RunState(VmRunMode::Exiting),
3894 );
3895 for (handle, _) in vcpu_handles {
3896 if let Err(e) = handle.join() {
3897 error!("failed to join vcpu thread: {:?}", e);
3898 }
3899 }
3900
3901 // After joining all vcpu threads, unregister the process-wide signal handler.
3902 if let Err(e) = vcpu::remove_vcpu_signal_handler() {
3903 error!("failed to remove vcpu thread signal handler: {:#}", e);
3904 }
3905
3906 // Stop the vmm-swap monitor process.
3907 #[cfg(feature = "swap")]
3908 drop(swap_controller);
3909
3910 // Stop pci root worker thread
3911 #[cfg(target_arch = "x86_64")]
3912 {
3913 let _ = hp_control_tube.send(PciRootCommand::Kill);
3914 if let Err(e) = hp_thread.join() {
3915 error!("failed to join hotplug thread: {:?}", e);
3916 }
3917 }
3918
3919 if linux.devices_thread.is_some() {
3920 if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
3921 error!("failed to stop device control loop: {}", e);
3922 };
3923 if let Some(thread) = linux.devices_thread.take() {
3924 if let Err(e) = thread.join() {
3925 error!("failed to exit devices thread: {:?}", e);
3926 }
3927 }
3928 }
3929
3930 // Shut down the VM Memory handler thread.
3931 if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
3932 error!(
3933 "failed to request exit from VM Memory handler thread: {}",
3934 e
3935 );
3936 }
3937 if let Err(e) = vm_memory_handler_thread.join() {
3938 error!("failed to exit VM Memory handler thread: {:?}", e);
3939 }
3940
3941 // Shut down the IRQ handler thread.
3942 if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
3943 error!("failed to request exit from IRQ handler thread: {}", e);
3944 }
3945 if let Err(e) = irq_handler_thread.join() {
3946 error!("failed to exit irq handler thread: {:?}", e);
3947 }
3948
3949 // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
3950 // inside `linux`. If the checks below fail, then some other thread is probably still running
3951 // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
3952 // cleaned up.
3953 match Arc::try_unwrap(std::mem::replace(
3954 &mut linux.mmio_bus,
3955 Arc::new(Bus::new(BusType::Mmio)),
3956 )) {
3957 Ok(_) => {}
3958 Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
3959 }
3960 match Arc::try_unwrap(std::mem::replace(
3961 &mut linux.io_bus,
3962 Arc::new(Bus::new(BusType::Io)),
3963 )) {
3964 Ok(_) => {}
3965 Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
3966 }
3967
3968 // Explicitly drop the VM structure here to allow the devices to clean up before the
3969 // control sockets are closed when this function exits.
3970 mem::drop(linux);
3971
3972 // Drop the hotplug manager to tell the warden process to exit before we try to join
3973 // the metrics thread.
3974 #[cfg(feature = "pci-hotplug")]
3975 mem::drop(hotplug_manager);
3976
3977 // All our children should have exited by now, so closing our fd should
3978 // terminate metrics. Then join so that everything gets flushed.
3979 metrics::get_destructor().cleanup();
3980 if let Some(metrics_thread) = metrics_thread {
3981 if let Err(e) = metrics_thread.join() {
3982 error!("failed to exit irq handler thread: {:?}", e);
3983 }
3984 }
3985
3986 stdin()
3987 .set_canon_mode()
3988 .expect("failed to restore canonical mode for terminal");
3989
3990 Ok(exit_state)
3991 }
3992
3993 #[derive(EventToken)]
3994 enum IrqHandlerToken {
3995 IrqFd { index: IrqEventIndex },
3996 VmIrq { id: usize },
3997 DelayedIrqFd,
3998 HandlerControl,
3999 }
4000
4001 /// Handles IRQs and requests from devices to add additional IRQ lines.
4002 fn irq_handler_thread(
4003 irq_control_tubes: Vec<Tube>,
4004 mut irq_chip: Box<dyn IrqChipArch + 'static>,
4005 sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
4006 handler_control: Tube,
4007 ) -> anyhow::Result<()> {
4008 let wait_ctx = WaitContext::build_with(&[(
4009 handler_control.get_read_notifier(),
4010 IrqHandlerToken::HandlerControl,
4011 )])
4012 .context("failed to build wait context")?;
4013
4014 if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
4015 wait_ctx
4016 .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
4017 .context("failed to add descriptor to wait context")?;
4018 }
4019
4020 let mut irq_event_tokens = irq_chip
4021 .irq_event_tokens()
4022 .context("failed get event tokens from irqchip")?;
4023
4024 for (index, _gsi, evt) in irq_event_tokens.iter() {
4025 wait_ctx
4026 .add(evt, IrqHandlerToken::IrqFd { index: *index })
4027 .context("failed to add irq chip event tokens to wait context")?;
4028 }
4029
4030 let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
4031 let mut next_control_id = irq_control_tubes.len();
4032 for (id, socket) in irq_control_tubes.iter() {
4033 wait_ctx
4034 .add(
4035 socket.get_read_notifier(),
4036 IrqHandlerToken::VmIrq { id: *id },
4037 )
4038 .context("irq control tubes to wait context")?;
4039 }
4040
4041 'wait: loop {
4042 let events = {
4043 match wait_ctx.wait() {
4044 Ok(v) => v,
4045 Err(e) => {
4046 error!("failed to poll: {}", e);
4047 break 'wait;
4048 }
4049 }
4050 };
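// Remember how many tokens fired this iteration; if a `WakeAndNotifyIteration` request arrives
// below, the response reports this count minus one (excluding the control message itself).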
4051 let token_count = events.len();
4052 let mut vm_irq_tubes_to_remove = Vec::new();
4053 let mut notify_control_on_iteration_end = false;
4054
4055 for event in events.iter().filter(|e| e.is_readable) {
4056 match event.token {
4057 IrqHandlerToken::HandlerControl => {
4058 match handler_control.recv::<IrqHandlerRequest>() {
4059 Ok(request) => {
4060 match request {
4061 IrqHandlerRequest::Exit => break 'wait,
4062 IrqHandlerRequest::AddIrqControlTubes(tubes) => {
4063 for socket in tubes {
4064 let id = next_control_id;
4065 next_control_id += 1;
4066 wait_ctx
4067 .add(
4068 socket.get_read_notifier(),
4069 IrqHandlerToken::VmIrq { id },
4070 )
4071 .context("failed to add new IRQ control Tube to wait context")?;
4072 irq_control_tubes.insert(id, socket);
4073 }
4074 }
4075 IrqHandlerRequest::RefreshIrqEventTokens => {
4076 for (_index, _gsi, evt) in irq_event_tokens.iter() {
4077 wait_ctx.delete(evt).context(
4078 "failed to remove irq chip event \
4079 token from wait context",
4080 )?;
4081 }
4082
4083 irq_event_tokens = irq_chip
4084 .irq_event_tokens()
4085 .context("failed get event tokens from irqchip")?;
4086 for (index, _gsi, evt) in irq_event_tokens.iter() {
4087 wait_ctx
4088 .add(evt, IrqHandlerToken::IrqFd { index: *index })
4089 .context(
4090 "failed to add irq chip event \
4091 tokens to wait context",
4092 )?;
4093 }
4094
4095 if let Err(e) = handler_control
4096 .send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
4097 {
4098 error!(
4099 "failed to notify IRQ event token refresh \
4100 was completed: {}",
4101 e
4102 );
4103 }
4104 }
4105 IrqHandlerRequest::WakeAndNotifyIteration => {
4106 notify_control_on_iteration_end = true;
4107 }
4108 }
4109 }
4110 Err(e) => {
4111 if let TubeError::Disconnected = e {
4112 panic!("irq handler control tube disconnected.");
4113 } else {
4114 error!("failed to recv IrqHandlerRequest: {}", e);
4115 }
4116 }
4117 }
4118 }
4119 IrqHandlerToken::VmIrq { id } => {
4120 if let Some(tube) = irq_control_tubes.get(&id) {
4121 handle_irq_tube_request(
4122 &sys_allocator_mutex,
4123 &mut irq_chip,
4124 &mut vm_irq_tubes_to_remove,
4125 &wait_ctx,
4126 tube,
4127 id,
4128 );
4129 }
4130 }
4131 IrqHandlerToken::IrqFd { index } => {
4132 if let Err(e) = irq_chip.service_irq_event(index) {
4133 error!("failed to signal irq {}: {}", index, e);
4134 }
4135 }
4136 IrqHandlerToken::DelayedIrqFd => {
4137 if let Err(e) = irq_chip.process_delayed_irq_events() {
4138 warn!("can't deliver delayed irqs: {}", e);
4139 }
4140 }
4141 }
4142 }
4143
4144 if notify_control_on_iteration_end {
4145 if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
4146 token_count - 1,
4147 )) {
4148 error!(
4149 "failed to notify on iteration completion (snapshotting may fail): {}",
4150 e
4151 );
4152 }
4153 }
4154
4155 remove_hungup_and_drained_tubes(
4156 &events,
4157 &wait_ctx,
4158 &mut irq_control_tubes,
4159 vm_irq_tubes_to_remove,
4160 |token: &IrqHandlerToken| {
4161 if let IrqHandlerToken::VmIrq { id } = token {
4162 return Some(*id);
4163 }
4164 None
4165 },
4166 )?;
4167 if events.iter().any(|e| {
4168 e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
4169 }) {
4170 error!("IRQ handler control hung up but did not request an exit.");
4171 break 'wait;
4172 }
4173 }
4174 Ok(())
4175 }
4176
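/// Services a single `VmIrqRequest` from a device's IRQ control tube, registering, routing, or
/// unregistering irqfd events on the irqchip as requested.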
4177 fn handle_irq_tube_request(
4178 sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
4179 irq_chip: &mut Box<dyn IrqChipArch + 'static>,
4180 vm_irq_tubes_to_remove: &mut Vec<usize>,
4181 wait_ctx: &WaitContext<IrqHandlerToken>,
4182 tube: &Tube,
4183 tube_index: usize,
4184 ) {
4185 match tube.recv::<VmIrqRequest>() {
4186 Ok(request) => {
4187 let response = {
4188 request.execute(
4189 |setup| match setup {
4190 IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
4191 let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4192 let source = IrqEventSource {
4193 device_id: device_id.try_into().expect("Invalid device_id"),
4194 queue_id,
4195 device_name,
4196 };
4197 if let Some(event_index) =
4198 irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
4199 {
4200 if let Err(e) =
4201 wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
4202 {
4203 warn!("failed to add IrqFd to poll context: {}", e);
4204 return Err(e);
4205 }
4206 }
4207 Ok(())
4208 }
4209 IrqSetup::Route(route) => irq_chip.route_irq(route),
4210 IrqSetup::UnRegister(irq, ev) => {
4211 let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4212 irq_chip.unregister_edge_irq_event(irq, &irq_evt)
4213 }
4214 },
4215 &mut sys_allocator_mutex.lock(),
4216 )
4217 };
4218 if let Err(e) = tube.send(&response) {
4219 error!("failed to send VmIrqResponse: {}", e);
4220 }
4221 }
4222 Err(e) => {
4223 if let TubeError::Disconnected = e {
4224 vm_irq_tubes_to_remove.push(tube_index);
4225 } else {
4226 error!("failed to recv VmIrqRequest: {}", e);
4227 }
4228 }
4229 }
4230 }
4231
4232 /// Commands to control the VM Memory handler thread.
4233 #[derive(serde::Serialize, serde::Deserialize)]
4234 pub enum VmMemoryHandlerRequest {
4235 /// No response is sent for this command.
4236 AddControlTubes(Vec<VmMemoryTube>),
4237 /// No response is sent for this command.
4238 Exit,
4239 }
4240
4241 fn vm_memory_handler_thread(
4242 control_tubes: Vec<VmMemoryTube>,
4243 mut vm: impl Vm,
4244 sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
4245 mut gralloc: RutabagaGralloc,
4246 mut iommu_client: Option<VmMemoryRequestIommuClient>,
4247 handler_control: Tube,
4248 ) -> anyhow::Result<()> {
4249 #[derive(EventToken)]
4250 enum Token {
4251 VmControl { id: usize },
4252 HandlerControl,
4253 }
4254
4255 let wait_ctx =
4256 WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
4257 .context("failed to build wait context")?;
4258 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
4259 let mut next_control_id = control_tubes.len();
4260 for (id, socket) in control_tubes.iter() {
4261 wait_ctx
4262 .add(socket.as_ref(), Token::VmControl { id: *id })
4263 .context("failed to add descriptor to wait context")?;
4264 }
4265
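// Per-thread bookkeeping of registered memory regions, shared with every request executed below.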
4266 let mut region_state = VmMemoryRegionState::new();
4267
4268 'wait: loop {
4269 let events = {
4270 match wait_ctx.wait() {
4271 Ok(v) => v,
4272 Err(e) => {
4273 error!("failed to poll: {}", e);
4274 break;
4275 }
4276 }
4277 };
4278
4279 let mut vm_control_ids_to_remove = Vec::new();
4280 for event in events.iter().filter(|e| e.is_readable) {
4281 match event.token {
4282 Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
4283 Ok(request) => match request {
4284 VmMemoryHandlerRequest::Exit => break 'wait,
4285 VmMemoryHandlerRequest::AddControlTubes(tubes) => {
4286 for socket in tubes {
4287 let id = next_control_id;
4288 next_control_id += 1;
4289 wait_ctx
4290 .add(socket.get_read_notifier(), Token::VmControl { id })
4291 .context(
4292 "failed to add new vm memory control Tube to wait context",
4293 )?;
4294 control_tubes.insert(id, socket);
4295 }
4296 }
4297 },
4298 Err(e) => {
4299 if let TubeError::Disconnected = e {
4300 panic!("vm memory control tube disconnected.");
4301 } else {
4302 error!("failed to recv VmMemoryHandlerRequest: {}", e);
4303 }
4304 }
4305 },
4306 Token::VmControl { id } => {
4307 if let Some(VmMemoryTube {
4308 tube,
4309 expose_with_viommu,
4310 }) = control_tubes.get(&id)
4311 {
4312 match tube.recv::<VmMemoryRequest>() {
4313 Ok(request) => {
4314 let response = request.execute(
4315 &mut vm,
4316 &mut sys_allocator_mutex.lock(),
4317 &mut gralloc,
4318 if *expose_with_viommu {
4319 iommu_client.as_mut()
4320 } else {
4321 None
4322 },
4323 &mut region_state,
4324 );
4325 if let Err(e) = tube.send(&response) {
4326 error!("failed to send VmMemoryControlResponse: {}", e);
4327 }
4328 }
4329 Err(e) => {
4330 if let TubeError::Disconnected = e {
4331 vm_control_ids_to_remove.push(id);
4332 } else {
4333 error!("failed to recv VmMemoryControlRequest: {}", e);
4334 }
4335 }
4336 }
4337 }
4338 }
4339 }
4340 }
4341
4342 remove_hungup_and_drained_tubes(
4343 &events,
4344 &wait_ctx,
4345 &mut control_tubes,
4346 vm_control_ids_to_remove,
4347 |token: &Token| {
4348 if let Token::VmControl { id } = token {
4349 return Some(*id);
4350 }
4351 None
4352 },
4353 )?;
4354 if events
4355 .iter()
4356 .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
4357 {
4358 error!("vm memory handler control hung up but did not request an exit.");
4359 break 'wait;
4360 }
4361 }
4362 Ok(())
4363 }
4364
4365 /// When control tubes hang up, we want to make sure that we've fully drained
4366 /// the underlying socket before removing it. This function also handles
4367 /// removing closed sockets in such a way that avoids phantom events.
4368 ///
4369 /// `tube_ids_to_remove` is the set of ids that we already know should
4370 /// be removed (e.g. from getting a disconnect error on read).
4371 fn remove_hungup_and_drained_tubes<T, U>(
4372 events: &SmallVec<[TriggeredEvent<T>; 16]>,
4373 wait_ctx: &WaitContext<T>,
4374 tubes: &mut BTreeMap<usize, U>,
4375 mut tube_ids_to_remove: Vec<usize>,
4376 get_tube_id: fn(token: &T) -> Option<usize>,
4377 ) -> anyhow::Result<()>
4378 where
4379 T: EventToken,
4380 U: ReadNotifier,
4381 {
4382 // It's possible more data is readable and buffered while the socket is hungup,
4383 // so don't delete the tube from the poll context until we're sure all the
4384 // data is read.
4385 // The case below covers the condition where we received a hungup event but the tube is not
4386 // readable.
4387 // For a readable tube, once all data has been read, any further attempt to read from the
4388 // hung-up tube will fail. On that failure we get a Disconnected error, and the id has already
4389 // been added to tube_ids_to_remove by the time we reach here.
4390 for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
4391 if let Some(id) = get_tube_id(&event.token) {
4392 tube_ids_to_remove.push(id);
4393 }
4394 }
4395
4396 tube_ids_to_remove.dedup();
4397 for id in tube_ids_to_remove {
4398 // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
4399 // this automatically when the FD inserted into the `wait_ctx` is closed after this
4400 // if-block, but this removal can be deferred unpredictably. In some instances where the
4401 // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
4402 // that has already been closed. Because the token associated with that spurious event
4403 // now belongs to a different socket, the control loop will start to interact with
4404 // sockets that might not be ready to use. This can cause incorrect hangup detection or
4405 // blocking on a socket that will never be ready. See also: crbug.com/1019986
4406 if let Some(socket) = tubes.remove(&id) {
4407 wait_ctx
4408 .delete(socket.get_read_notifier())
4409 .context("failed to remove descriptor from wait context")?;
4410 }
4411 }
4412 Ok(())
4413 }
4414
4415 /// Start and jail a vhost-user device according to its configuration and a vhost listener string.
4416 ///
4417 /// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
4418 /// call outside of `start_devices`!
4419 ///
4420 /// Returns the pid of the jailed device process.
4421 fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
4422 jail_config: &Option<JailConfig>,
4423 params: T,
4424 vhost: &str,
4425 name: &str,
4426 ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
4427 let mut keep_rds = Vec::new();
4428
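// Collect the descriptors that must stay open in the forked child: logging, tracing, and metrics
// fds, plus whatever the device, listener, and executor add below.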
4429 base::syslog::push_descriptors(&mut keep_rds);
4430 cros_tracing::push_descriptors!(&mut keep_rds);
4431 metrics::push_descriptors(&mut keep_rds);
4432
4433 let jail_type = VirtioDeviceType::VhostUser;
4434
4435 // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
4436 // return `None`, so we fall back to an empty (i.e. non-constrained) Minijail.
4437 let jail = params
4438 .create_jail(jail_config, jail_type)
4439 .with_context(|| format!("failed to create jail for {}", name))?
4440 .ok_or(())
4441 .or_else(|_| Minijail::new())
4442 .with_context(|| format!("failed to create empty jail for {}", name))?;
4443
4444 // Create the device in the parent process, so the child does not need any privileges necessary
4445 // to do it (only runtime capabilities are required).
4446 let device = params
4447 .create_vhost_user_device(&mut keep_rds)
4448 .context("failed to create vhost-user device")?;
4449 let mut listener = VhostUserListener::new(vhost, Some(&mut keep_rds))
4450 .context("failed to create the vhost listener")?;
4451 let parent_resources = listener.take_parent_process_resources();
4452
4453 // Executor must be created before jail in order to prevent the jailed process from creating
4454 // unrestricted io_urings.
4455 let ex = Executor::new().context("Failed to create an Executor")?;
4456 keep_rds.extend(ex.as_raw_descriptors());
4457
4458 // Deduplicate the FDs since minijail expects them to be unique.
4459 keep_rds.sort_unstable();
4460 keep_rds.dedup();
4461
4462 // SAFETY:
4463 // Safe because we are keeping all the descriptors needed for the child to function.
4464 match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
4465 0 => {
4466 // In the child process.
4467
4468 // Free memory for the resources managed by the parent, without running drop() on them.
4469 // The parent will do it as we exit.
4470 let _ = std::mem::ManuallyDrop::new(parent_resources);
4471
4472 // Make sure the child process does not survive its parent.
4473 // SAFETY: trivially safe
4474 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
4475 panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
4476 }
4477
4478 // Set the name for the thread.
4479 const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
4480 let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
4481 let thread_name = CString::new(debug_label_trimmed).unwrap();
4482 // SAFETY:
4483 // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
4484 // an error if we don't anyway).
4485 let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };
4486
4487 // Run the device loop and terminate the child process once it exits.
4488 let res = match listener.run_device(ex, device) {
4489 Ok(()) => 0,
4490 Err(e) => {
4491 error!("error while running device {}: {:#}", name, e);
4492 1
4493 }
4494 };
4495 // SAFETY: trivially safe
4496 unsafe { libc::exit(res) };
4497 }
4498 pid => {
4499 // In the parent process. We will drop the device and listener when exiting this method.
4500 // This is fine as ownership for both has been transferred to the child process and they
4501 // will keep living there. We just retain `parent_resources` for things we are supposed
4502 // to clean up ourselves.
4503
4504 info!("process for device {} (PID {}) started", &name, pid);
4505 #[cfg(feature = "seccomp_trace")]
4506 debug!(
4507 "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
4508 pid,
4509 &name,
4510 read_jail_addr(&jail)
4511 );
4512 Ok((pid, parent_resources))
4513 }
4514 }
4515 }
4516
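/// Handles a single `VmRequest` received over the vhost-user control socket; only disk commands
/// are currently supported.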
4517 fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
4518 let command = tube
4519 .recv::<VmRequest>()
4520 .context("failed to receive VmRequest")?;
4521 let resp = match command {
4522 VmRequest::DiskCommand {
4523 disk_index,
4524 ref command,
4525 } => match &disk_host_tubes.get(disk_index) {
4526 Some(tube) => handle_disk_command(command, tube),
4527 None => VmResponse::Err(base::Error::new(libc::ENODEV)),
4528 },
4529 request => {
4530 error!(
4531 "Request {:?} currently not supported in vhost user backend",
4532 request
4533 );
4534 VmResponse::Err(base::Error::new(libc::EPERM))
4535 }
4536 };
4537
4538 tube.send(&resp).context("failed to send VmResponse")?;
4539 Ok(())
4540 }
4541
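/// Accepts connections on the control server socket and serves each incoming request
/// synchronously, one at a time.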
4542 fn start_vhost_user_control_server(
4543 control_server_socket: UnlinkUnixSeqpacketListener,
4544 disk_host_tubes: Vec<Tube>,
4545 ) {
4546 info!("Start vhost-user control server");
4547 loop {
4548 match control_server_socket.accept() {
4549 Ok(socket) => {
4550 let tube = match Tube::new_from_unix_seqpacket(socket) {
4551 Ok(tube) => tube,
4552 Err(e) => {
4553 error!("failed to open tube: {:#}", e);
4554 return;
4555 }
4556 };
4557 if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
4558 error!("failed to process control request: {:#}", e);
4559 }
4560 }
4561 Err(e) => {
4562 error!("failed to establish connection: {}", e);
4563 }
4564 }
4565 }
4566 }
4567
4568 pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
4569 if let Some(async_executor) = opts.async_executor {
4570 Executor::set_default_executor_kind(async_executor)
4571 .context("Failed to set the default async executor")?;
4572 }
4573
4574 struct DeviceJailInfo {
4575 // Unique name for the device, in the form `foomatic-0`.
4576 name: String,
4577 _drop_resources: Option<Box<dyn std::any::Any>>,
4578 }
4579
4580 fn add_device<T: VirtioDeviceBuilder>(
4581 i: usize,
4582 device_params: T,
4583 vhost: &str,
4584 jail_config: &Option<JailConfig>,
4585 devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
4586 ) -> anyhow::Result<()> {
4587 let name = format!("{}-{}", T::NAME, i);
4588
4589 let (pid, _drop_resources) =
4590 jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;
4591
4592 devices_jails.insert(
4593 pid,
4594 DeviceJailInfo {
4595 name,
4596 _drop_resources,
4597 },
4598 );
4599
4600 Ok(())
4601 }
4602
4603 let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();
4604
4605 let jail = if opts.disable_sandbox {
4606 None
4607 } else {
4608 Some(opts.jail)
4609 };
4610
4611 // Create control server socket
4612 let control_server_socket = opts.control_socket.map(|path| {
4613 UnlinkUnixSeqpacketListener(
4614 UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
4615 )
4616 });
4617
4618 // Create serial devices.
4619 for (i, params) in opts.serial.iter().enumerate() {
4620 let serial_config = &params.device;
4621 add_device(i, serial_config, &params.vhost, &jail, &mut devices_jails)?;
4622 }
4623
4624 let mut disk_host_tubes = Vec::new();
4625 let control_socket_exists = control_server_socket.is_some();
4626 // Create block devices.
4627 for (i, params) in opts.block.iter().enumerate() {
4628 let tube = if control_socket_exists {
4629 let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
4630 disk_host_tubes.push(host_tube);
4631 Some(device_tube)
4632 } else {
4633 None
4634 };
4635 let disk_config = DiskConfig::new(&params.device, tube);
4636 add_device(i, disk_config, &params.vhost, &jail, &mut devices_jails)?;
4637 }
4638
4639 // Create vsock devices.
4640 for (i, params) in opts.vsock.iter().enumerate() {
4641 add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
4642 }
4643
4644 // Create network devices.
4645 #[cfg(feature = "net")]
4646 for (i, params) in opts.net.iter().enumerate() {
4647 add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
4648 }
4649
4650 // No device was created; that's probably not intended - print the help in that case.
4651 if devices_jails.is_empty() {
4652 let err = DevicesCommand::from_args(
4653 &[&std::env::args().next().unwrap_or(String::from("crosvm"))],
4654 &["--help"],
4655 )
4656 .unwrap_err();
4657 println!("{}", err.output);
4658 return Ok(());
4659 }
4660
4661 let ex = Executor::new()?;
4662 if let Some(control_server_socket) = control_server_socket {
4663 // Start the control server in the parent process.
4664 ex.spawn_blocking(move || {
4665 start_vhost_user_control_server(control_server_socket, disk_host_tubes)
4666 })
4667 .detach();
4668 }
4669
4670 // Now wait for all device processes to return.
4671 while !devices_jails.is_empty() {
4672 match base::linux::wait_for_pid(-1, 0) {
4673 Err(e) => panic!("error waiting for child process to complete: {:#}", e),
4674 Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
4675 Some((_, info)) => {
4676 if let Some(status) = wait_status.code() {
4677 info!(
4678 "process for device {} (PID {}) exited with code {}",
4679 &info.name, pid, status
4680 );
4681 } else if let Some(signal) = wait_status.signal() {
4682 warn!(
4683 "process for device {} (PID {}) has been killed by signal {:?}",
4684 &info.name, pid, signal,
4685 );
4686 }
4687 }
4688 None => error!("pid {} is not one of our device processes", pid),
4689 },
4690 // `wait_for_pid` will necessarily return a PID because we asked it to wait for one to
4691 // complete.
4692 Ok((None, _)) => unreachable!(),
4693 }
4694 }
4695
4696 info!("all device processes have exited");
4697
4698 Ok(())
4699 }
4700
4701 /// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
4702 /// making crash reports incomprehensible.
4703 #[cfg(feature = "crash-report")]
4704 pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
4705 crash_report::setup_crash_reporting(crash_report::CrashReportAttributes {
4706 product_type: "emulator".to_owned(),
4707 pipe_name: None,
4708 report_uuid: None,
4709 product_name: None,
4710 product_version: None,
4711 })
4712 }
4713
4714 #[cfg(test)]
4715 mod tests {
4716 use std::path::PathBuf;
4717
4718 use super::*;
4719
4720 // Create a file-backed mapping parameters struct with the given `address` and `size` and other
4721 // parameters set to default values.
4722 fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
4723 FileBackedMappingParameters {
4724 address,
4725 size,
4726 path: PathBuf::new(),
4727 offset: 0,
4728 writable: false,
4729 sync: false,
4730 align: false,
4731 }
4732 }
4733
4734 #[test]
4735 fn guest_mem_file_backed_mappings_overlap() {
4736 // Base case: no file mappings; output layout should be identical.
4737 assert_eq!(
4738 punch_holes_in_guest_mem_layout_for_mappings(
4739 vec![
4740 (GuestAddress(0), 0xD000_0000, Default::default()),
4741 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4742 ],
4743 &[]
4744 ),
4745 vec![
4746 (GuestAddress(0), 0xD000_0000, Default::default()),
4747 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4748 ]
4749 );
4750
4751 // File mapping that does not overlap guest memory.
4752 assert_eq!(
4753 punch_holes_in_guest_mem_layout_for_mappings(
4754 vec![
4755 (GuestAddress(0), 0xD000_0000, Default::default()),
4756 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4757 ],
4758 &[test_file_backed_mapping(0xD000_0000, 0x1000)]
4759 ),
4760 vec![
4761 (GuestAddress(0), 0xD000_0000, Default::default()),
4762 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4763 ]
4764 );
4765
4766 // File mapping at the start of the low address space region.
4767 assert_eq!(
4768 punch_holes_in_guest_mem_layout_for_mappings(
4769 vec![
4770 (GuestAddress(0), 0xD000_0000, Default::default()),
4771 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4772 ],
4773 &[test_file_backed_mapping(0, 0x2000)]
4774 ),
4775 vec![
4776 (
4777 GuestAddress(0x2000),
4778 0xD000_0000 - 0x2000,
4779 Default::default()
4780 ),
4781 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4782 ]
4783 );
4784
4785 // File mapping at the end of the low address space region.
4786 assert_eq!(
4787 punch_holes_in_guest_mem_layout_for_mappings(
4788 vec![
4789 (GuestAddress(0), 0xD000_0000, Default::default()),
4790 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4791 ],
4792 &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
4793 ),
4794 vec![
4795 (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
4796 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4797 ]
4798 );
4799
4800 // File mapping fully contained within the middle of the low address space region.
4801 assert_eq!(
4802 punch_holes_in_guest_mem_layout_for_mappings(
4803 vec![
4804 (GuestAddress(0), 0xD000_0000, Default::default()),
4805 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4806 ],
4807 &[test_file_backed_mapping(0x1000, 0x2000)]
4808 ),
4809 vec![
4810 (GuestAddress(0), 0x1000, Default::default()),
4811 (
4812 GuestAddress(0x3000),
4813 0xD000_0000 - 0x3000,
4814 Default::default()
4815 ),
4816 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4817 ]
4818 );
4819
4820 // File mapping at the start of the high address space region.
4821 assert_eq!(
4822 punch_holes_in_guest_mem_layout_for_mappings(
4823 vec![
4824 (GuestAddress(0), 0xD000_0000, Default::default()),
4825 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4826 ],
4827 &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
4828 ),
4829 vec![
4830 (GuestAddress(0), 0xD000_0000, Default::default()),
4831 (
4832 GuestAddress(0x1_0000_2000),
4833 0x8_0000 - 0x2000,
4834 Default::default()
4835 ),
4836 ]
4837 );
4838
4839 // File mapping at the end of the high address space region.
4840 assert_eq!(
4841 punch_holes_in_guest_mem_layout_for_mappings(
4842 vec![
4843 (GuestAddress(0), 0xD000_0000, Default::default()),
4844 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4845 ],
4846 &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
4847 ),
4848 vec![
4849 (GuestAddress(0), 0xD000_0000, Default::default()),
4850 (
4851 GuestAddress(0x1_0000_0000),
4852 0x8_0000 - 0x2000,
4853 Default::default()
4854 ),
4855 ]
4856 );
4857
4858 // File mapping fully contained within the middle of the high address space region.
4859 assert_eq!(
4860 punch_holes_in_guest_mem_layout_for_mappings(
4861 vec![
4862 (GuestAddress(0), 0xD000_0000, Default::default()),
4863 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4864 ],
4865 &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
4866 ),
4867 vec![
4868 (GuestAddress(0), 0xD000_0000, Default::default()),
4869 (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
4870 (
4871 GuestAddress(0x1_0000_3000),
4872 0x8_0000 - 0x3000,
4873 Default::default()
4874 ),
4875 ]
4876 );
4877
4878 // File mapping overlapping two guest memory regions.
4879 assert_eq!(
4880 punch_holes_in_guest_mem_layout_for_mappings(
4881 vec![
4882 (GuestAddress(0), 0xD000_0000, Default::default()),
4883 (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
4884 ],
4885 &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
4886 ),
4887 vec![
4888 (GuestAddress(0), 0xA000_0000, Default::default()),
4889 (
4890 GuestAddress(0x1_0000_2000),
4891 0x8_0000 - 0x2000,
4892 Default::default()
4893 ),
4894 ]
4895 );
4896 }
4897 }
4898