1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #[cfg(target_os = "android")]
6 mod android;
7 pub mod cmdline;
8 pub mod config;
9 mod device_helpers;
10 pub(crate) mod ext2;
11 #[cfg(feature = "gpu")]
12 pub(crate) mod gpu;
13 #[cfg(feature = "pci-hotplug")]
14 pub(crate) mod jail_warden;
15 #[cfg(feature = "pci-hotplug")]
16 pub(crate) mod pci_hotplug_helpers;
17 #[cfg(feature = "pci-hotplug")]
18 pub(crate) mod pci_hotplug_manager;
19 mod vcpu;
20
21 #[cfg(all(feature = "pvclock", target_arch = "aarch64"))]
22 use std::arch::asm;
23 use std::cmp::max;
24 use std::collections::BTreeMap;
25 use std::collections::BTreeSet;
26 #[cfg(feature = "registered_events")]
27 use std::collections::HashMap;
28 #[cfg(feature = "registered_events")]
29 use std::collections::HashSet;
30 use std::convert::TryInto;
31 use std::ffi::CString;
32 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
33 use std::fs::create_dir_all;
34 use std::fs::File;
35 use std::fs::OpenOptions;
36 #[cfg(feature = "registered_events")]
37 use std::hash::Hash;
38 use std::io::stdin;
39 use std::iter;
40 use std::mem;
41 #[cfg(target_arch = "x86_64")]
42 use std::ops::RangeInclusive;
43 use std::os::unix::process::ExitStatusExt;
44 use std::path::Path;
45 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
46 use std::path::PathBuf;
47 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
48 use std::process;
49 #[cfg(feature = "registered_events")]
50 use std::rc::Rc;
51 use std::sync::mpsc;
52 use std::sync::Arc;
53 use std::sync::Barrier;
54 use std::thread::JoinHandle;
55
56 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
57 use aarch64::AArch64 as Arch;
58 use acpi_tables::sdt::SDT;
59 use anyhow::anyhow;
60 use anyhow::bail;
61 use anyhow::Context;
62 use anyhow::Result;
63 use arch::DtbOverlay;
64 use arch::IrqChipArch;
65 use arch::LinuxArch;
66 use arch::RunnableLinuxVm;
67 use arch::VcpuAffinity;
68 use arch::VcpuArch;
69 use arch::VirtioDeviceStub;
70 use arch::VmArch;
71 use arch::VmComponents;
72 use arch::VmImage;
73 use argh::FromArgs;
74 use base::ReadNotifier;
75 #[cfg(feature = "balloon")]
76 use base::UnixSeqpacket;
77 use base::UnixSeqpacketListener;
78 use base::UnlinkUnixSeqpacketListener;
79 use base::*;
80 use cros_async::Executor;
81 use device_helpers::*;
82 use devices::create_devices_worker_thread;
83 use devices::serial_device::SerialHardware;
84 #[cfg(all(feature = "pvclock", target_arch = "x86_64"))]
85 use devices::tsc::get_tsc_sync_mitigations;
86 use devices::vfio::VfioContainerManager;
87 #[cfg(feature = "gpu")]
88 use devices::virtio;
89 #[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
90 use devices::virtio::device_constants::video::VideoDeviceType;
91 #[cfg(feature = "gpu")]
92 use devices::virtio::gpu::EventDevice;
93 #[cfg(target_arch = "x86_64")]
94 use devices::virtio::memory_mapper::MemoryMapper;
95 use devices::virtio::memory_mapper::MemoryMapperTrait;
96 use devices::virtio::vhost::user::VhostUserConnectionTrait;
97 use devices::virtio::vhost::user::VhostUserListener;
98 #[cfg(feature = "balloon")]
99 use devices::virtio::BalloonFeatures;
100 #[cfg(feature = "pci-hotplug")]
101 use devices::virtio::NetParameters;
102 #[cfg(feature = "pci-hotplug")]
103 use devices::virtio::NetParametersMode;
104 use devices::virtio::VirtioDevice;
105 use devices::virtio::VirtioDeviceType;
106 use devices::Bus;
107 use devices::BusDeviceObj;
108 use devices::BusType;
109 use devices::CoIommuDev;
110 #[cfg(feature = "usb")]
111 use devices::DeviceProvider;
112 #[cfg(target_arch = "x86_64")]
113 use devices::HotPlugBus;
114 #[cfg(target_arch = "x86_64")]
115 use devices::HotPlugKey;
116 use devices::IommuDevType;
117 use devices::IrqEventIndex;
118 use devices::IrqEventSource;
119 #[cfg(feature = "pci-hotplug")]
120 use devices::NetResourceCarrier;
121 #[cfg(target_arch = "x86_64")]
122 use devices::PciAddress;
123 #[cfg(target_arch = "x86_64")]
124 use devices::PciBridge;
125 use devices::PciDevice;
126 #[cfg(target_arch = "x86_64")]
127 use devices::PciMmioMapper;
128 #[cfg(target_arch = "x86_64")]
129 use devices::PciRoot;
130 #[cfg(target_arch = "x86_64")]
131 use devices::PciRootCommand;
132 #[cfg(target_arch = "x86_64")]
133 use devices::PcieDownstreamPort;
134 #[cfg(target_arch = "x86_64")]
135 use devices::PcieHostPort;
136 #[cfg(target_arch = "x86_64")]
137 use devices::PcieRootPort;
138 #[cfg(target_arch = "x86_64")]
139 use devices::PcieUpstreamPort;
140 use devices::PvPanicCode;
141 use devices::PvPanicPciDevice;
142 #[cfg(feature = "pci-hotplug")]
143 use devices::ResourceCarrier;
144 use devices::StubPciDevice;
145 use devices::VirtioPciDevice;
146 #[cfg(feature = "usb")]
147 use devices::XhciController;
148 #[cfg(feature = "gpu")]
149 use gpu::*;
150 #[cfg(target_arch = "riscv64")]
151 use hypervisor::CpuConfigRiscv64;
152 #[cfg(target_arch = "x86_64")]
153 use hypervisor::CpuConfigX86_64;
154 use hypervisor::Hypervisor;
155 use hypervisor::HypervisorCap;
156 use hypervisor::MemCacheType;
157 use hypervisor::ProtectionType;
158 use hypervisor::Vm;
159 use hypervisor::VmCap;
160 use jail::*;
161 #[cfg(feature = "pci-hotplug")]
162 use jail_warden::JailWarden;
163 #[cfg(feature = "pci-hotplug")]
164 use jail_warden::JailWardenImpl;
165 #[cfg(feature = "pci-hotplug")]
166 use jail_warden::PermissiveJailWarden;
167 use libc;
168 use metrics::MetricsController;
169 use minijail::Minijail;
170 #[cfg(feature = "pci-hotplug")]
171 use pci_hotplug_manager::PciHotPlugManager;
172 use resources::AddressRange;
173 use resources::Alloc;
174 use resources::SystemAllocator;
175 #[cfg(target_arch = "riscv64")]
176 use riscv64::Riscv64 as Arch;
177 use rutabaga_gfx::RutabagaGralloc;
178 use rutabaga_gfx::RutabagaGrallocBackendFlags;
179 use smallvec::SmallVec;
180 #[cfg(feature = "swap")]
181 use swap::SwapController;
182 use sync::Condvar;
183 use sync::Mutex;
184 use vm_control::api::VmMemoryClient;
185 use vm_control::*;
186 use vm_memory::FileBackedMappingParameters;
187 use vm_memory::GuestAddress;
188 use vm_memory::GuestMemory;
189 use vm_memory::MemoryPolicy;
190 use vm_memory::MemoryRegionOptions;
191 #[cfg(target_arch = "x86_64")]
192 use x86_64::X8664arch as Arch;
193
194 use crate::crosvm::config::Config;
195 use crate::crosvm::config::Executable;
196 use crate::crosvm::config::HypervisorKind;
197 use crate::crosvm::config::InputDeviceOption;
198 use crate::crosvm::config::IrqChipKind;
199 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
200 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
201 #[cfg(feature = "gdb")]
202 use crate::crosvm::gdb::gdb_thread;
203 #[cfg(feature = "gdb")]
204 use crate::crosvm::gdb::GdbStub;
205 #[cfg(target_arch = "x86_64")]
206 use crate::crosvm::ratelimit::Ratelimit;
207 use crate::crosvm::sys::cmdline::DevicesCommand;
208 use crate::crosvm::sys::config::SharedDir;
209 use crate::crosvm::sys::config::SharedDirKind;
210 use crate::crosvm::sys::platform::vcpu::VcpuPidTid;
211
// Device node opened when the KVM hypervisor backend is selected.
const KVM_PATH: &str = "/dev/kvm";
// Device node for the GenieZone hypervisor (Arm targets only).
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
const GENIEZONE_PATH: &str = "/dev/gzvm";
// Device node for the Gunyah hypervisor (Arm targets only).
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
static GUNYAH_PATH: &str = "/dev/gunyah";
218
/// Builds the list of virtio device stubs (device plus optional jail) requested by `cfg`.
///
/// Devices are instantiated in a fixed order: wayland, video tubes, gpu, virtio-console
/// serial, block disks, scsi, pmem, pmem-ext2, rng, pvclock, vtpm proxy, input devices,
/// balloon, net, snd, media/v4l2, video decoder/encoder, vsock, vhost-scmi, vhost-user-fs,
/// shared dirs (fs/p9), sound, and vhost-user frontends.
///
/// Host-side control tubes created along the way are handed to the caller via
/// `add_control_tube`. `worker_process_pids` collects PIDs of device worker child
/// processes (pmem-ext2 here) that are expected to exit cleanly.
fn create_virtio_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
    #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
    worker_process_pids: &mut BTreeSet<Pid>,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // Tubes that let the wayland/video devices share resources with the gpu device.
    #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
    let mut resource_bridges = Vec::<Tube>::new();

    if !cfg.wayland_socket_paths.is_empty() {
        #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
        let mut wl_resource_bridge = None::<Tube>;

        // If a gpu device is also configured, bridge the wayland device to it.
        #[cfg(feature = "gpu")]
        {
            if cfg.gpu_parameters.is_some() {
                let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
                resource_bridges.push(gpu_socket);
                wl_resource_bridge = Some(wl_socket);
            }
        }

        devs.push(create_wayland_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            &cfg.wayland_socket_paths,
            wl_resource_bridge,
        )?);
    }

    // Pre-create the tube pairs for each video/media device; the gpu half goes into
    // `resource_bridges`, the device half is consumed further below.
    #[cfg(all(feature = "media", feature = "video-decoder"))]
    let media_adapter_cfg = cfg
        .media_decoder
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for media adapter");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    #[cfg(feature = "video-decoder")]
    let video_dec_cfg = cfg
        .video_dec
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video decoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    #[cfg(feature = "video-encoder")]
    let video_enc_cfg = cfg
        .video_enc
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video encoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    #[cfg(feature = "gpu")]
    {
        if let Some(gpu_parameters) = &cfg.gpu_parameters {
            // Event devices feed host display input (mouse/keyboard) into the guest.
            let mut event_devices = Vec::new();
            if cfg.display_window_mouse {
                let display_param = if gpu_parameters.display_params.is_empty() {
                    Default::default()
                } else {
                    gpu_parameters.display_params[0].clone()
                };
                let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();

                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                // Default the touch geometry to the display size, but let the first
                // configured multi-touch input option override width/height/name.
                let mut multi_touch_width = gpu_display_w;
                let mut multi_touch_height = gpu_display_h;
                let mut multi_touch_name = None;
                for input in &cfg.virtio_input {
                    if let InputDeviceOption::MultiTouch {
                        width,
                        height,
                        name,
                        ..
                    } = input
                    {
                        if let Some(width) = width {
                            multi_touch_width = *width;
                        }
                        if let Some(height) = height {
                            multi_touch_height = *height;
                        }
                        if let Some(name) = name {
                            multi_touch_name = Some(name.as_str());
                        }
                        break;
                    }
                }
                let dev = virtio::input::new_multi_touch(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    multi_touch_width,
                    multi_touch_height,
                    multi_touch_name,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up mouse device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(cfg.jail_config.as_ref(), "input_device")?,
                });
                event_devices.push(EventDevice::touchscreen(event_device_socket));
            }
            if cfg.display_window_keyboard {
                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let dev = virtio::input::new_keyboard(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up keyboard device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(cfg.jail_config.as_ref(), "input_device")?,
                });
                event_devices.push(EventDevice::keyboard(event_device_socket));
            }

            let (gpu_control_host_tube, gpu_control_device_tube) =
                Tube::pair().context("failed to create gpu tube")?;
            add_control_tube(DeviceControlTube::Gpu(gpu_control_host_tube).into());
            devs.push(create_gpu_device(
                cfg,
                vm_evt_wrtube,
                gpu_control_device_tube,
                resource_bridges,
                render_server_fd,
                has_vfio_gfx_device,
                event_devices,
            )?);
        }
    }

    // Serial ports configured as virtio-console devices.
    for (_, param) in cfg
        .serial_parameters
        .iter()
        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
    {
        let dev =
            param.create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?;
        devs.push(dev);
    }

    // virtio-block disks; each gets a control tube for runtime disk commands.
    for disk in &cfg.disks {
        let (disk_host_tube, disk_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::Disk(disk_host_tube).into());
        let disk_config = DiskConfig::new(disk, Some(disk_device_tube));
        devs.push(
            disk_config
                .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
        );
    }

    // All SCSI disks are served by a single virtio-scsi controller.
    if !cfg.scsis.is_empty() {
        let scsi_config = ScsiConfig(&cfg.scsis);
        devs.push(
            scsi_config
                .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
        );
    }

    for (index, pmem_disk) in cfg.pmems.iter().enumerate() {
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            vm,
            resources,
            pmem_disk,
            index,
            pmem_device_tube,
        )?);
    }

    for (index, pmem_ext2) in cfg.pmem_ext2.iter().enumerate() {
        // Prepare a `VmMemoryClient` for pmem-ext2 device to send a request for mmap() and memory
        // registration.
        let (pmem_ext2_host_tube, pmem_ext2_device_tube) =
            Tube::pair().context("failed to create tube")?;
        let vm_memory_client = VmMemoryClient::new(pmem_ext2_device_tube);
        add_control_tube(
            VmMemoryTube {
                tube: pmem_ext2_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_ext2_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            resources,
            pmem_ext2,
            index,
            vm_memory_client,
            pmem_device_tube,
            worker_process_pids,
        )?);
    }

    if cfg.rng {
        devs.push(create_rng_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
        )?);
    }

    #[cfg(feature = "pvclock")]
    if cfg.pvclock {
        // pvclock gets a tube for handling suspend/resume requests from the main thread.
        let (host_suspend_tube, suspend_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::PvClock(host_suspend_tube).into());

        // Guest clock frequency in Hz: TSC frequency on x86_64, generic timer
        // frequency (CNTFRQ_EL0) on aarch64.
        let frequency: u64;
        #[cfg(target_arch = "x86_64")]
        {
            let tsc_state = devices::tsc::tsc_state()?;
            let tsc_sync_mitigations =
                get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
            if tsc_state.core_grouping.size() > 1 {
                // Host TSCs are not in sync. Log what mitigations are applied.
                warn!(
                    "Host TSCs are not in sync, applying the following mitigations: {:?}",
                    tsc_sync_mitigations
                );
            }
            frequency = tsc_state.frequency;
        }
        #[cfg(target_arch = "aarch64")]
        {
            let mut x: u64;
            // SAFETY: This instruction has no side effect apart from storing the current timestamp
            // frequency into the specified register.
            unsafe {
                asm!("mrs {x}, cntfrq_el0",
                    x = out(reg) x,
                );
            }
            frequency = x;

            // If unset, KVM defaults to an offset that is calculated from VM boot time. Explicitly
            // set it to zero on boot. When updating the offset, we always set it to the total
            // amount of time the VM has been suspended.
            vm.set_counter_offset(0)
                .context("failed to set up pvclock")?;
        }
        let dev = create_pvclock_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            frequency,
            suspend_tube,
        )?;
        devs.push(dev);
        info!("virtio-pvclock is enabled for this vm");
    }

    #[cfg(feature = "vtpm")]
    {
        if cfg.vtpm_proxy {
            devs.push(create_vtpm_proxy_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
            )?);
        }
    }

    // Per-kind counters give each input device of the same kind a distinct index.
    let mut keyboard_idx = 0;
    let mut mouse_idx = 0;
    let mut rotary_idx = 0;
    let mut switches_idx = 0;
    let mut multi_touch_idx = 0;
    let mut single_touch_idx = 0;
    let mut trackpad_idx = 0;
    let mut multi_touch_trackpad_idx = 0;
    let mut custom_idx = 0;
    for input in &cfg.virtio_input {
        let input_dev = match input {
            InputDeviceOption::Evdev { path } => create_vinput_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                path.as_path(),
            )?,
            InputDeviceOption::Keyboard { path } => {
                let dev = create_keyboard_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    keyboard_idx,
                )?;
                keyboard_idx += 1;
                dev
            }
            InputDeviceOption::Mouse { path } => {
                let dev = create_mouse_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    mouse_idx,
                )?;
                mouse_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Only the first multi-touch device inherits the display input size.
                if multi_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_multi_touch_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_idx,
                )?;
                multi_touch_idx += 1;
                dev
            }
            InputDeviceOption::Rotary { path } => {
                let dev = create_rotary_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    rotary_idx,
                )?;
                rotary_idx += 1;
                dev
            }
            InputDeviceOption::SingleTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Only the first single-touch device inherits the display input size.
                if single_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_single_touch_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    single_touch_idx,
                )?;
                single_touch_idx += 1;
                dev
            }
            InputDeviceOption::Switches { path } => {
                let dev = create_switches_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    switches_idx,
                )?;
                switches_idx += 1;
                dev
            }
            InputDeviceOption::Trackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_trackpad_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    trackpad_idx,
                )?;
                trackpad_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouchTrackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_multitouch_trackpad_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_trackpad_idx,
                )?;
                multi_touch_trackpad_idx += 1;
                dev
            }
            InputDeviceOption::Custom { path, config_path } => {
                let dev = create_custom_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    custom_idx,
                    config_path.clone(),
                )?;
                custom_idx += 1;
                dev
            }
        };
        devs.push(input_dev);
    }

    #[cfg(feature = "balloon")]
    if cfg.balloon {
        // Balloon control either connects to an external socket (`--balloon-control`)
        // or gets a dedicated tube registered with the main process.
        let balloon_device_tube = if let Some(ref path) = cfg.balloon_control {
            Tube::try_from(UnixSeqpacket::connect(path).with_context(|| {
                format!(
                    "failed to connect to balloon control socket {}",
                    path.display(),
                )
            })?)?
        } else {
            // Balloon gets a special socket so balloon requests can be forwarded
            // from the main process.
            let (host, device) = Tube::pair().context("failed to create tube")?;
            add_control_tube(DeviceControlTube::Balloon(host).into());
            device
        };

        // Build the virtio feature bits from the enabled balloon options.
        let balloon_features = (cfg.balloon_page_reporting as u64)
            << BalloonFeatures::PageReporting as u64
            | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;

        let init_balloon_size = if let Some(init_memory) = cfg.init_memory {
            // `init_memory` is in MiB; convert to bytes with saturation.
            let init_memory_bytes = init_memory.saturating_mul(1024 * 1024);
            let total_memory_bytes = vm.get_memory().memory_size();

            if init_memory_bytes > total_memory_bytes {
                bail!(
                    "initial memory {} cannot be greater than total memory {}",
                    init_memory,
                    total_memory_bytes / (1024 * 1024),
                );
            }

            // The initial balloon size is the total memory size minus the initial memory size.
            total_memory_bytes - init_memory_bytes
        } else {
            // No --init-mem specified; start with balloon completely deflated.
            0
        };

        // The balloon device also needs a tube to communicate back to the main process to
        // handle remapping memory dynamically.
        let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
            Tube::pair().context("failed to create tube")?;
        add_control_tube(
            VmMemoryTube {
                tube: dynamic_mapping_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );

        devs.push(create_balloon_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            balloon_device_tube,
            balloon_inflate_tube,
            init_balloon_size,
            VmMemoryClient::new(dynamic_mapping_device_tube),
            balloon_features,
            #[cfg(feature = "registered_events")]
            Some(
                registered_evt_q
                    .try_clone()
                    .context("failed to clone registered_evt_q tube")?,
            ),
            cfg.balloon_ws_num_bins,
        )?);
    }

    #[cfg(feature = "net")]
    for opt in &cfg.net {
        let dev =
            opt.create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?;
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    {
        for (card_index, virtio_snd) in cfg.virtio_snds.iter().enumerate() {
            let (snd_host_tube, snd_device_tube) =
                Tube::pair().context("failed to create tube for snd")?;
            add_control_tube(DeviceControlTube::Snd(snd_host_tube).into());
            // Each sound device is assigned its position in the list as card index.
            let mut snd_params = virtio_snd.clone();
            snd_params.card_index = card_index;
            devs.push(create_virtio_snd_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                snd_params,
                snd_device_tube,
            )?);
        }
    }

    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[cfg(feature = "media")]
    {
        for v4l2_device in &cfg.v4l2_proxy {
            devs.push(create_v4l2_device(cfg.protection_type, v4l2_device)?);
        }
    }

    #[cfg(feature = "media")]
    if cfg.simple_media_device {
        devs.push(create_simple_media_device(cfg.protection_type)?);
    }

    #[cfg(all(feature = "media", feature = "video-decoder"))]
    {
        for (tube, backend) in media_adapter_cfg {
            devs.push(create_virtio_media_adapter(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                tube,
                backend,
            )?);
        }
    }

    #[cfg(feature = "video-decoder")]
    {
        for (tube, backend) in video_dec_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                VideoDeviceType::Decoder,
            )?;
        }
    }

    #[cfg(feature = "video-encoder")]
    {
        for (tube, backend) in video_enc_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                VideoDeviceType::Encoder,
            )?;
        }
    }

    if let Some(vsock_config) = &cfg.vsock {
        devs.push(
            vsock_config
                .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
        );
    }

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        if cfg.vhost_scmi {
            devs.push(create_vhost_scmi_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                cfg.vhost_scmi_device.clone(),
            )?);
        }
    }
    for vhost_user_fs in &cfg.vhost_user_fs {
        devs.push(create_vhost_user_fs_device(
            cfg.protection_type,
            vhost_user_fs,
        )?);
    }

    // Shared directories are exported either as virtio-fs or as 9p, per `kind`.
    for shared_dir in &cfg.shared_dirs {
        let SharedDir {
            src,
            tag,
            kind,
            ugid,
            uid_map,
            gid_map,
            fs_cfg,
            p9_cfg,
        } = shared_dir;

        let dev = match kind {
            SharedDirKind::FS => {
                let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
                add_control_tube(TaggedControlTube::Fs(host_tube).into());

                create_fs_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    *ugid,
                    uid_map,
                    gid_map,
                    src,
                    tag,
                    fs_cfg.clone(),
                    device_tube,
                )?
            }
            SharedDirKind::P9 => create_9p_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                *ugid,
                uid_map,
                gid_map,
                src,
                tag,
                p9_cfg.clone(),
            )?,
        };
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    if let Some(path) = &cfg.sound {
        devs.push(create_sound_device(
            path,
            cfg.protection_type,
            cfg.jail_config.as_ref(),
        )?);
    }

    for opt in &cfg.vhost_user {
        devs.push(create_vhost_user_frontend(
            cfg.protection_type,
            opt,
            cfg.vhost_user_connect_timeout_ms,
        )?);
    }

    Ok(devs)
}
913
create_devices( cfg: &Config, vm: &mut impl VmArch, resources: &mut SystemAllocator, add_control_tube: &mut impl FnMut(AnyControlTube), vm_evt_wrtube: &SendTube, iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>, #[cfg(feature = "usb")] usb_provider: DeviceProvider, #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>, iova_max_addr: &mut Option<u64>, #[cfg(feature = "registered_events")] registered_evt_q: &SendTube, vfio_container_manager: &mut VfioContainerManager, worker_process_pids: &mut BTreeSet<Pid>, ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>>914 fn create_devices(
915 cfg: &Config,
916 vm: &mut impl VmArch,
917 resources: &mut SystemAllocator,
918 add_control_tube: &mut impl FnMut(AnyControlTube),
919 vm_evt_wrtube: &SendTube,
920 iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
921 #[cfg(feature = "usb")] usb_provider: DeviceProvider,
922 #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
923 iova_max_addr: &mut Option<u64>,
924 #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
925 vfio_container_manager: &mut VfioContainerManager,
926 // Stores a set of PID of child processes that are suppose to exit cleanly.
927 worker_process_pids: &mut BTreeSet<Pid>,
928 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
929 let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
930 #[cfg(feature = "balloon")]
931 let mut balloon_inflate_tube: Option<Tube> = None;
932 #[cfg(feature = "gpu")]
933 let mut has_vfio_gfx_device = false;
934 if !cfg.vfio.is_empty() {
935 let mut coiommu_attached_endpoints = Vec::new();
936
937 for vfio_dev in &cfg.vfio {
938 let (dev, jail, viommu_mapper) = create_vfio_device(
939 cfg.jail_config.as_ref(),
940 vm,
941 resources,
942 add_control_tube,
943 &vfio_dev.path,
944 false,
945 None,
946 vfio_dev.guest_address,
947 Some(&mut coiommu_attached_endpoints),
948 vfio_dev.iommu,
949 vfio_dev.dt_symbol.clone(),
950 vfio_container_manager,
951 )?;
952 match dev {
953 VfioDeviceVariant::Pci(vfio_pci_device) => {
954 *iova_max_addr = Some(max(
955 vfio_pci_device.get_max_iova(),
956 iova_max_addr.unwrap_or(0),
957 ));
958
959 #[cfg(feature = "gpu")]
960 if vfio_pci_device.is_gfx() {
961 has_vfio_gfx_device = true;
962 }
963
964 if let Some(viommu_mapper) = viommu_mapper {
965 iommu_attached_endpoints.insert(
966 vfio_pci_device
967 .pci_address()
968 .context("not initialized")?
969 .to_u32(),
970 Arc::new(Mutex::new(Box::new(viommu_mapper))),
971 );
972 }
973
974 devices.push((Box::new(vfio_pci_device), jail));
975 }
976 VfioDeviceVariant::Platform(vfio_plat_dev) => {
977 devices.push((Box::new(vfio_plat_dev), jail));
978 }
979 }
980 }
981
982 if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
983 let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
984 // SAFETY: trivially safe
985 let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
986 if res == 0 {
987 // SAFETY: safe because getrlimit64 has returned success.
988 let limit = unsafe { buf.assume_init() };
989 let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
990 let rlim_max = max(limit.rlim_max, rlim_new);
991 if limit.rlim_cur < rlim_new {
992 let limit_arg = libc::rlimit64 {
993 rlim_cur: rlim_new,
994 rlim_max,
995 };
996 // SAFETY: trivially safe
997 let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
998 if res != 0 {
999 bail!("Set rlimit failed");
1000 }
1001 }
1002 } else {
1003 bail!("Get rlimit failed");
1004 }
1005 }
1006 #[cfg(feature = "balloon")]
1007 let coiommu_tube: Option<Tube>;
1008 #[cfg(not(feature = "balloon"))]
1009 let coiommu_tube: Option<Tube> = None;
1010 if !coiommu_attached_endpoints.is_empty() {
1011 let vfio_container = vfio_container_manager
1012 .get_container(IommuDevType::CoIommu, None as Option<&Path>)
1013 .context("failed to get vfio container")?;
1014 let (coiommu_host_tube, coiommu_device_tube) =
1015 Tube::pair().context("failed to create coiommu tube")?;
1016 add_control_tube(
1017 VmMemoryTube {
1018 tube: coiommu_host_tube,
1019 expose_with_viommu: false,
1020 }
1021 .into(),
1022 );
1023 let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
1024 #[cfg(feature = "balloon")]
1025 match Tube::pair() {
1026 Ok((x, y)) => {
1027 coiommu_tube = Some(x);
1028 balloon_inflate_tube = Some(y);
1029 }
1030 Err(x) => return Err(x).context("failed to create coiommu tube"),
1031 }
1032 let dev = CoIommuDev::new(
1033 vm.get_memory().clone(),
1034 vfio_container,
1035 VmMemoryClient::new(coiommu_device_tube),
1036 coiommu_tube,
1037 coiommu_attached_endpoints,
1038 vcpu_count,
1039 cfg.coiommu_param.unwrap_or_default(),
1040 )
1041 .context("failed to create coiommu device")?;
1042
1043 devices.push((
1044 Box::new(dev),
1045 simple_jail(cfg.jail_config.as_ref(), "coiommu_device")?,
1046 ));
1047 }
1048 }
1049
1050 let stubs = create_virtio_devices(
1051 cfg,
1052 vm,
1053 resources,
1054 add_control_tube,
1055 vm_evt_wrtube,
1056 #[cfg(feature = "balloon")]
1057 balloon_inflate_tube,
1058 worker_process_pids,
1059 #[cfg(feature = "gpu")]
1060 render_server_fd,
1061 #[cfg(feature = "gpu")]
1062 has_vfio_gfx_device,
1063 #[cfg(feature = "registered_events")]
1064 registered_evt_q,
1065 )?;
1066
1067 for stub in stubs {
1068 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1069 add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
1070
1071 let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
1072 let (host_tube, device_tube) =
1073 Tube::pair().context("failed to create shared memory tube")?;
1074 add_control_tube(
1075 VmMemoryTube {
1076 tube: host_tube,
1077 expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
1078 }
1079 .into(),
1080 );
1081 Some(device_tube)
1082 } else {
1083 None
1084 };
1085
1086 let (ioevent_host_tube, ioevent_device_tube) =
1087 Tube::pair().context("failed to create ioevent tube")?;
1088 add_control_tube(
1089 VmMemoryTube {
1090 tube: ioevent_host_tube,
1091 expose_with_viommu: false,
1092 }
1093 .into(),
1094 );
1095
1096 let (host_tube, device_tube) =
1097 Tube::pair().context("failed to create device control tube")?;
1098 add_control_tube(TaggedControlTube::Vm(host_tube).into());
1099
1100 let dev = VirtioPciDevice::new(
1101 vm.get_memory().clone(),
1102 stub.dev,
1103 msi_device_tube,
1104 cfg.disable_virtio_intx,
1105 shared_memory_tube.map(VmMemoryClient::new),
1106 VmMemoryClient::new(ioevent_device_tube),
1107 device_tube,
1108 )
1109 .context("failed to create virtio pci dev")?;
1110
1111 devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
1112 }
1113
1114 #[cfg(feature = "usb")]
1115 if cfg.usb {
1116 // Create xhci controller.
1117 let usb_controller = Box::new(XhciController::new(
1118 vm.get_memory().clone(),
1119 Box::new(usb_provider),
1120 ));
1121 devices.push((
1122 usb_controller,
1123 simple_jail(cfg.jail_config.as_ref(), "xhci_device")?,
1124 ));
1125 }
1126
1127 for params in &cfg.stub_pci_devices {
1128 // Stub devices don't need jailing since they don't do anything.
1129 devices.push((Box::new(StubPciDevice::new(params)), None));
1130 }
1131
1132 devices.push((
1133 Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
1134 None,
1135 ));
1136
1137 Ok(devices)
1138 }
1139
create_mmio_file_backed_mappings( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, ) -> Result<()>1140 fn create_mmio_file_backed_mappings(
1141 cfg: &Config,
1142 vm: &mut impl Vm,
1143 resources: &mut SystemAllocator,
1144 ) -> Result<()> {
1145 for mapping in &cfg.file_backed_mappings_mmio {
1146 let file = mapping
1147 .open()
1148 .context("failed to open file for file-backed mapping")?;
1149 let prot = if mapping.writable {
1150 Protection::read_write()
1151 } else {
1152 Protection::read()
1153 };
1154 let size = mapping
1155 .size
1156 .try_into()
1157 .context("Invalid size for file-backed mapping")?;
1158 let memory_mapping = MemoryMappingBuilder::new(size)
1159 .from_file(&file)
1160 .offset(mapping.offset)
1161 .protection(prot)
1162 .build()
1163 .context("failed to map backing file for file-backed mapping")?;
1164
1165 let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
1166 .context("failed to convert to AddressRange")?;
1167 match resources.mmio_allocator_any().allocate_at(
1168 mapping_range,
1169 Alloc::FileBacked(mapping.address),
1170 "file-backed mapping".to_owned(),
1171 ) {
1172 // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
1173 // consider it an error.
1174 // TODO(b/222769529): Reserve this region in a global memory address space allocator
1175 // once we have that so nothing else can accidentally overlap with it.
1176 Ok(()) | Err(resources::Error::OutOfSpace) => {}
1177 e => e.context("failed to allocate guest address for file-backed mapping")?,
1178 }
1179
1180 vm.add_memory_region(
1181 GuestAddress(mapping.address),
1182 Box::new(memory_mapping),
1183 !mapping.writable,
1184 /* log_dirty_pages = */ false,
1185 MemCacheType::CacheCoherent,
1186 )
1187 .context("failed to configure file-backed mapping")?;
1188 }
1189
1190 Ok(())
1191 }
1192
#[cfg(target_arch = "x86_64")]
/// Collection of devices related to PCI hotplug.
struct HotPlugStub {
    /// Map from bus index to hotplug bus.
    hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    /// Bus ranges of devices for virtio-iommu.
    iommu_bus_ranges: Vec<RangeInclusive<u32>>,
    /// Map from GPE index to GpeNotify devices.
    gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
    /// Map from bus index to PmeNotify devices.
    pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
}
1205
1206 #[cfg(target_arch = "x86_64")]
1207 impl HotPlugStub {
1208 /// Constructs empty HotPlugStub.
new() -> Self1209 fn new() -> Self {
1210 Self {
1211 hotplug_buses: BTreeMap::new(),
1212 iommu_bus_ranges: Vec::new(),
1213 gpe_notify_devs: BTreeMap::new(),
1214 pme_notify_devs: BTreeMap::new(),
1215 }
1216 }
1217 }
1218
1219 #[cfg(target_arch = "x86_64")]
1220 /// Creates PCIE root port with only virtual devices.
1221 ///
1222 /// user doesn't specify host pcie root port which link to this virtual pcie rp,
1223 /// find the empty bus and create a total virtual pcie rp
create_pure_virtual_pcie_root_port( sys_allocator: &mut SystemAllocator, add_control_tube: &mut impl FnMut(AnyControlTube), devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, hp_bus_count: u8, ) -> Result<HotPlugStub>1224 fn create_pure_virtual_pcie_root_port(
1225 sys_allocator: &mut SystemAllocator,
1226 add_control_tube: &mut impl FnMut(AnyControlTube),
1227 devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
1228 hp_bus_count: u8,
1229 ) -> Result<HotPlugStub> {
1230 let mut hp_sec_buses = Vec::new();
1231 let mut hp_stub = HotPlugStub::new();
1232 // Create Pcie Root Port for non-root buses, each non-root bus device will be
1233 // connected behind a virtual pcie root port.
1234 for i in 1..255 {
1235 if sys_allocator.pci_bus_empty(i) {
1236 if hp_sec_buses.len() < hp_bus_count.into() {
1237 hp_sec_buses.push(i);
1238 }
1239 continue;
1240 }
1241 let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
1242 hp_stub
1243 .pme_notify_devs
1244 .insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
1245 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1246 add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
1247 let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1248 // no ipc is used if the root port disables hotplug
1249 devices.push((pci_bridge, None));
1250 }
1251
1252 // Create Pcie Root Port for hot-plug
1253 if hp_sec_buses.len() < hp_bus_count.into() {
1254 return Err(anyhow!("no more addresses are available"));
1255 }
1256
1257 for hp_sec_bus in hp_sec_buses {
1258 let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
1259 hp_stub.pme_notify_devs.insert(
1260 hp_sec_bus,
1261 pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
1262 );
1263 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1264 add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
1265 let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1266
1267 hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
1268 PciAddress {
1269 bus: pci_bridge.get_secondary_num(),
1270 dev: 0,
1271 func: 0,
1272 }
1273 .to_u32(),
1274 PciAddress {
1275 bus: pci_bridge.get_subordinate_num(),
1276 dev: 32,
1277 func: 8,
1278 }
1279 .to_u32(),
1280 ));
1281
1282 devices.push((pci_bridge, None));
1283 hp_stub
1284 .hotplug_buses
1285 .insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
1286 }
1287 Ok(hp_stub)
1288 }
1289
/// Assembles the `VmComponents` for a VM from the parsed command-line `Config`.
///
/// Opens the kernel/BIOS, initrd, pvmfw, and pflash images, computes the memory
/// and swiotlb sizes, and (on ARM) derives vCPU frequency-domain information
/// for virtual cpufreq support.
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
    // Optional initial ramdisk image.
    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
        Some(
            open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
        )
    } else {
        None
    };
    // Optional protected VM firmware image.
    let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
        Some(
            open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
        )
    } else {
        None
    };

    // Either a kernel or a BIOS must have been specified; the catch-all panic
    // documents that upstream config validation should make it unreachable.
    let vm_image = match cfg.executable_path {
        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
            open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
                || format!("failed to open kernel image {}", kernel_path.display()),
            )?,
        ),
        Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
            open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
        ),
        _ => panic!("Did not receive a bios or kernel, should be impossible."),
    };

    // swiotlb size in bytes: explicit value if given (MiB -> bytes, overflow
    // checked); otherwise none for unprotected VMs and a 64 MiB default for
    // protected ones.
    let swiotlb = if let Some(size) = cfg.swiotlb {
        Some(
            size.checked_mul(1024 * 1024)
                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
        )
    } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
        None
    } else {
        Some(64 * 1024 * 1024)
    };

    // Optional pflash image, opened read-write; block size comes from the same
    // parameter struct (0 when no pflash was requested).
    let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
    {
        (
            Some(
                open_file_or_duplicate(
                    &pflash_parameters.path,
                    OpenOptions::new().read(true).write(true),
                )
                .with_context(|| {
                    format!("failed to open pflash {}", pflash_parameters.path.display())
                })?,
            ),
            pflash_parameters.block_size,
        )
    } else {
        (None, 0)
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut cpu_frequencies = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut normalized_cpu_ipc_ratios = BTreeMap::new();

    // if --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg
    let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
    // Mirror the host topology when requested, otherwise take the values from
    // the configuration.
    let (cpu_clusters, cpu_capacity) = if cfg.host_cpu_topology {
        (
            Arch::get_host_cpu_clusters()?,
            Arch::get_host_cpu_capacity()?,
        )
    } else {
        (cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domain_paths = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domains = BTreeMap::new();

    // Virtual cpufreq support (ARM only): map each vCPU onto a host frequency
    // domain and compute normalized IPC ratios.
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    if cfg.virt_cpufreq || cfg.virt_cpufreq_v2 {
        if !cfg.cpu_frequencies_khz.is_empty() {
            // Explicit per-vCPU frequency tables were provided; use them as-is.
            cpu_frequencies = cfg.cpu_frequencies_khz.clone();
        } else {
            match Arch::get_host_cpu_frequencies_khz() {
                Ok(host_cpu_frequencies) => {
                    for cpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
                        let vcpu_affinity = match cfg.vcpu_affinity.clone() {
                            Some(VcpuAffinity::Global(v)) => v,
                            Some(VcpuAffinity::PerVcpu(mut m)) => {
                                m.remove(&cpu_id).unwrap_or_default()
                            }
                            None => {
                                panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
                            }
                        };

                        // Check that the physical CPUs that the vCPU is affined to all share the
                        // same frequency domain.
                        // NOTE(review): indexes vcpu_affinity[0]; assumes the affinity list is
                        // non-empty for every vCPU — confirm upstream validation.
                        if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
                            for cpu in vcpu_affinity.iter() {
                                if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
                                    if frequencies != freq_domain {
                                        panic!("Affined CPUs do not share a frequency domain!");
                                    }
                                }
                            }
                            cpu_frequencies.insert(cpu_id, freq_domain.clone());
                        } else {
                            panic!("No frequency domain for cpu:{}", cpu_id);
                        }
                    }
                }
                Err(e) => {
                    warn!("Unable to get host cpu frequencies {:#}", e);
                }
            }
        }

        if !cpu_frequencies.is_empty() {
            let host_max_freqs = Arch::get_host_cpu_max_freq_khz()?;
            // Find the highest maximum frequency over all host CPUs. The guest CPU IPC ratios will
            // be normalized by dividing by this value.
            let host_max_freq = host_max_freqs.values().copied().max().unwrap_or_default();

            normalized_cpu_ipc_ratios = normalize_cpu_ipc_ratios(
                cpu_frequencies.iter().map(|(cpu_id, frequencies)| {
                    (
                        *cpu_id,
                        frequencies.iter().copied().max().unwrap_or_default(),
                    )
                }),
                host_max_freq,
                // Default IPC ratio is 1024 when none was configured for a vCPU.
                |cpu_id| cfg.cpu_ipc_ratio.get(&cpu_id).copied().unwrap_or(1024),
            )?;

            if !cfg.cpu_freq_domains.is_empty() {
                let cgroup_path = cfg
                    .vcpu_cgroup_path
                    .clone()
                    .context("cpu_freq_domains requires vcpu_cgroup_path")?;

                // The presence of cgroup.controllers distinguishes a cgroup v2
                // hierarchy from v1.
                if !cgroup_path.join("cgroup.controllers").exists() {
                    panic!("CGroupsV2 must be enabled for cpu freq domain support!");
                }

                // Assign parent crosvm process to top level cgroup
                let cgroup_procs_path = cgroup_path.join("cgroup.procs");
                std::fs::write(
                    cgroup_procs_path.clone(),
                    process::id().to_string().as_bytes(),
                )
                .with_context(|| {
                    format!(
                        "failed to create vcpu-cgroup-path {}",
                        cgroup_procs_path.display(),
                    )
                })?;

                // One cgroup subtree per frequency domain; remember each core's
                // domain index and cgroup path for later vCPU placement.
                for (freq_domain_idx, cpus) in cfg.cpu_freq_domains.iter().enumerate() {
                    let vcpu_domain_path =
                        cgroup_path.join(format!("vcpu-domain{}", freq_domain_idx));
                    // Create subtree for domain
                    create_dir_all(&vcpu_domain_path)?;

                    // Set vcpu_domain cgroup type as 'threaded' to get thread level granularity
                    // controls
                    let cgroup_type_path = cgroup_path.join(vcpu_domain_path.join("cgroup.type"));
                    std::fs::write(cgroup_type_path.clone(), b"threaded").with_context(|| {
                        format!(
                            "failed to create vcpu-cgroup-path {}",
                            cgroup_type_path.display(),
                        )
                    })?;
                    for core_idx in cpus.iter() {
                        vcpu_domain_paths.insert(*core_idx, vcpu_domain_path.clone());
                        vcpu_domains.insert(*core_idx, freq_domain_idx as u32);
                    }
                }
            }
        }
    }

    Ok(VmComponents {
        #[cfg(target_arch = "x86_64")]
        ac_adapter: cfg.ac_adapter,
        #[cfg(target_arch = "x86_64")]
        break_linux_pci_config_io: cfg.break_linux_pci_config_io,
        // Guest memory size in bytes; defaults to 256 MiB, overflow checked.
        memory_size: cfg
            .memory
            .unwrap_or(256)
            .checked_mul(1024 * 1024)
            .ok_or_else(|| anyhow!("requested memory size too large"))?,
        swiotlb,
        fw_cfg_enable,
        bootorder_fw_cfg_blob: Vec::new(),
        vcpu_count: cfg.vcpu_count.unwrap_or(1),
        vcpu_affinity: cfg.vcpu_affinity.clone(),
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domains,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domain_paths,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        cpu_frequencies,
        fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
        cpu_clusters,
        cpu_capacity,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        normalized_cpu_ipc_ratios,
        no_smt: cfg.no_smt,
        hugepages: cfg.hugepages,
        hv_cfg: hypervisor::Config {
            #[cfg(target_arch = "aarch64")]
            mte: cfg.mte,
            protection_type: cfg.protection_type,
        },
        vm_image,
        android_fstab: cfg
            .android_fstab
            .as_ref()
            .map(|x| {
                File::open(x)
                    .with_context(|| format!("failed to open android fstab file {}", x.display()))
            })
            .map_or(Ok(None), |v| v.map(Some))?,
        pstore: cfg.pstore.clone(),
        pflash_block_size,
        pflash_image,
        initrd_image,
        extra_kernel_params: cfg.params.clone(),
        acpi_sdts: cfg
            .acpi_tables
            .iter()
            .map(|path| {
                SDT::from_file(path)
                    .with_context(|| format!("failed to open ACPI file {}", path.display()))
            })
            .collect::<Result<Vec<SDT>>>()?,
        rt_cpus: cfg.rt_cpus.clone(),
        delay_rt: cfg.delay_rt,
        no_i8042: cfg.no_i8042,
        no_rtc: cfg.no_rtc,
        #[cfg(target_arch = "x86_64")]
        smbios: cfg.smbios.clone(),
        host_cpu_topology: cfg.host_cpu_topology,
        itmt: cfg.itmt,
        #[cfg(target_arch = "x86_64")]
        force_s2idle: cfg.force_s2idle,
        pvm_fw: pvm_fw_image,
        pci_config: cfg.pci_config,
        dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
        boot_cpu: cfg.boot_cpu,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        virt_cpufreq_v2: cfg.virt_cpufreq_v2,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        sve_config: cfg.sve.unwrap_or_default(),
    })
}
1550
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
/// Scales each vCPU's IPC ratio by the ratio of its maximum frequency to the
/// highest maximum frequency across all host CPUs.
///
/// Returns an error when `host_max_freq` is zero or when a scaled ratio does
/// not fit in a `u32`.
fn normalize_cpu_ipc_ratios(
    max_frequency_per_cpu: impl Iterator<Item = (usize, u32)>,
    host_max_freq: u32,
    cpu_ipc_ratio: impl Fn(usize) -> u32,
) -> Result<BTreeMap<usize, u32>> {
    if host_max_freq == 0 {
        return Err(anyhow!("invalid host_max_freq 0"));
    }
    let divisor = u64::from(host_max_freq);

    max_frequency_per_cpu
        .map(|(cpu_id, max_freq)| {
            // Widen to u64 before multiplying so the product cannot overflow,
            // then narrow the quotient back down.
            let scaled = u64::from(cpu_ipc_ratio(cpu_id)) * u64::from(max_freq) / divisor;
            let ratio =
                u32::try_from(scaled).context("normalized CPU IPC ratio out of u32 range")?;
            Ok((cpu_id, ratio))
        })
        .collect()
}
1578
/// Final state of the VM, returned to the caller when the VM stops running
/// (see `run_config`, which returns `Result<ExitState>`).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
    /// The guest requested a reset.
    Reset,
    /// The guest stopped normally.
    Stop,
    /// The VM terminated abnormally.
    Crash,
    /// The guest reported a panic.
    GuestPanic,
    /// A watchdog triggered a reset.
    WatchdogReset,
}
1587
// Replaces ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
// Returns the updated guest memory layout.
fn punch_holes_in_guest_mem_layout_for_mappings(
    guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
    file_backed_mappings_ram: &[FileBackedMappingParameters],
) -> Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>> {
    // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
    // at end is not included in the range).
    let mut layout_set = BTreeSet::new();
    for (addr, size, options) in &guest_mem_layout {
        layout_set.insert((addr.offset(), addr.offset() + size, options.clone()));
    }

    // Make sure the RAM mappings are a subset of the RAM memory layout.
    // For simplicity, we currently require each mapping to be fully contained within a single
    // region of the input layout.
    for mapping in file_backed_mappings_ram {
        anyhow::ensure!(
            layout_set
                .iter()
                .any(|(addr, size, _)| *addr <= mapping.address
                    && mapping.address + mapping.size <= *addr + *size),
            "RAM file-backed-mapping must be a subset of a RAM region"
        );
    }

    for mapping in file_backed_mappings_ram.iter().cloned() {
        let mapping_start = mapping.address;
        // Exclusive end of the mapping, matching the (start, end) convention above.
        let mapping_end = mapping_start + mapping.size;
        // Purpose of the first region this mapping overlaps; all overlapping
        // regions are required to share it.
        let mut purpose = None;
        // Repeatedly split overlapping guest memory regions until no overlaps remain.
        while let Some((range_start, range_end, options)) = layout_set
            .iter()
            .find(|&&(range_start, range_end, _)| {
                mapping_start < range_end && mapping_end > range_start
            })
            .cloned()
        {
            let purpose = *purpose.get_or_insert(options.purpose);
            anyhow::ensure!(
                options.purpose == purpose,
                "RAM file-backed-mapping cannot span regions with different purposes: {:?} vs {:?}",
                options.purpose,
                purpose
            );

            layout_set.remove(&(range_start, range_end, options.clone()));

            // Re-insert the non-overlapping head and/or tail of the split region.
            if range_start < mapping_start {
                layout_set.insert((range_start, mapping_start, options.clone()));
            }
            if range_end > mapping_end {
                layout_set.insert((mapping_end, range_end, options));
            }
        }
        // Insert the file-backed region itself. `purpose` is Some here because the
        // subset check above guarantees the while loop found at least one overlap.
        layout_set.insert((
            mapping_start,
            mapping_end,
            MemoryRegionOptions::new()
                .purpose(purpose.unwrap())
                .file_backed(mapping),
        ));
    }

    // Build the final guest memory layout from the modified layout_set.
    Ok(layout_set
        .into_iter()
        .map(|(start, end, options)| (GuestAddress(start), end - start, options))
        .collect())
}
1658
create_guest_memory( cfg: &Config, components: &VmComponents, arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout, hypervisor: &impl Hypervisor, ) -> Result<GuestMemory>1659 fn create_guest_memory(
1660 cfg: &Config,
1661 components: &VmComponents,
1662 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1663 hypervisor: &impl Hypervisor,
1664 ) -> Result<GuestMemory> {
1665 let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
1666 .context("failed to create guest memory layout")?;
1667
1668 let guest_mem_layout = punch_holes_in_guest_mem_layout_for_mappings(
1669 guest_mem_layout,
1670 &cfg.file_backed_mappings_ram,
1671 )?;
1672
1673 let mut guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1674 .context("failed to create guest memory")?;
1675 let mut mem_policy = MemoryPolicy::empty();
1676 if components.hugepages {
1677 mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1678 }
1679
1680 if cfg.lock_guest_memory {
1681 mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1682 }
1683 guest_mem.set_memory_policy(mem_policy);
1684
1685 if cfg.unmap_guest_memory_on_fork {
1686 // Note that this isn't compatible with sandboxing. We could potentially fix that by
1687 // delaying the call until after the sandboxed devices are forked. However, the main use
1688 // for this is in conjunction with protected VMs, where most of the guest memory has been
1689 // unshared with the host. We'd need to be confident that the guest memory is unshared with
1690 // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1691 // So, for now we keep things simple to be safe.
1692 guest_mem.use_dontfork().context("use_dontfork failed")?;
1693 }
1694
1695 Ok(guest_mem)
1696 }
1697
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
/// Creates a GenieZone-backed VM from `cfg`/`components` and runs it via `run_vm`.
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::GeniezoneKernelIrqChip;
    use hypervisor::geniezone::Geniezone;
    use hypervisor::geniezone::GeniezoneVcpu;
    use hypervisor::geniezone::GeniezoneVm;

    // Open the GenieZone device, falling back to the default path.
    let device_path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
    let gzvm = Geniezone::new_with_path(device_path)
        .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gzvm)?;

    // Launch the vmm-swap monitor process only when a swap dir was configured.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm =
        GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // GenieZone only supports the in-kernel irqchip; other modes are rejected.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
        IrqChipKind::Kernel => {
            ioapic_host_tube = None;
            GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
                .context("failed to create IRQ chip")?
        }
    };

    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1754
/// Creates a KVM-backed VM from `cfg`/`components` and runs it via `run_vm`.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::KvmKernelIrqChip;
    #[cfg(target_arch = "x86_64")]
    use devices::KvmSplitIrqChip;
    use hypervisor::kvm::Kvm;
    use hypervisor::kvm::KvmVcpu;
    use hypervisor::kvm::KvmVm;

    // Open the KVM device, falling back to the default path.
    let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &kvm)?;

    // Launch the vmm-swap monitor process only when a swap dir was configured.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    #[cfg(target_arch = "x86_64")]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    // Check that the VM was actually created in protected mode as expected.
    // This check is only needed on aarch64. On x86_64, protected VM creation will fail
    // if protected mode is not supported.
    #[cfg(not(target_arch = "x86_64"))]
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Local wrapper so both irqchip flavors can be handed to `run_vm` as a
    // `&mut dyn IrqChipArch` from a single binding.
    enum KvmIrqChip {
        #[cfg(target_arch = "x86_64")]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        // Borrows whichever variant is active as the common trait object.
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(target_arch = "x86_64")]
                KvmIrqChip::Split(i) => i,
                KvmIrqChip::Kernel(i) => i,
            }
        }
    }

    // Split mode creates a userspace IOAPIC tube; kernel mode needs none.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Userspace => {
            bail!("KVM userspace irqchip mode not implemented");
        }
        IrqChipKind::Split => {
            #[cfg(not(target_arch = "x86_64"))]
            bail!("KVM split irqchip mode only supported on x86 processors");
            #[cfg(target_arch = "x86_64")]
            {
                let (host_tube, ioapic_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                ioapic_host_tube = Some(host_tube);
                KvmIrqChip::Split(
                    KvmSplitIrqChip::new(
                        vm_clone,
                        components.vcpu_count,
                        ioapic_device_tube,
                        Some(24),
                    )
                    .context("failed to create IRQ chip")?,
                )
            }
        }
        IrqChipKind::Kernel => {
            ioapic_host_tube = None;
            KvmIrqChip::Kernel(
                KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
                    .context("failed to create IRQ chip")?,
            )
        }
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1858
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
/// Creates a Gunyah-backed VM from `cfg`/`components` and runs it via `run_vm`.
fn run_gunyah(
    device_path: Option<&Path>,
    qcom_trusted_vm_id: Option<u16>,
    qcom_trusted_vm_pas_id: Option<u32>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::Gunyah;
    use hypervisor::gunyah::GunyahVcpu;
    use hypervisor::gunyah::GunyahVm;

    // Open the Gunyah device, falling back to the default path.
    let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(device_path)
        .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gunyah)?;

    // Launch the vmm-swap monitor process only when a swap dir was configured.
    #[cfg(feature = "swap")]
    let swap_controller = match cfg.swap_dir.as_ref() {
        Some(swap_dir) => Some(
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")?,
        ),
        None => None,
    };

    let vm = GunyahVm::new(
        &gunyah,
        qcom_trusted_vm_id,
        qcom_trusted_vm_pas_id,
        guest_mem,
        components.hv_cfg,
    )
    .context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    let vm_clone = vm.try_clone()?;

    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut GunyahIrqChip::new(vm_clone)?,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1910
1911 /// Choose a default hypervisor if no `--hypervisor` option was specified.
get_default_hypervisor() -> Option<HypervisorKind>1912 fn get_default_hypervisor() -> Option<HypervisorKind> {
1913 let kvm_path = Path::new(KVM_PATH);
1914 if kvm_path.exists() {
1915 return Some(HypervisorKind::Kvm {
1916 device: Some(kvm_path.to_path_buf()),
1917 });
1918 }
1919
1920 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1921 #[cfg(feature = "geniezone")]
1922 {
1923 let gz_path = Path::new(GENIEZONE_PATH);
1924 if gz_path.exists() {
1925 return Some(HypervisorKind::Geniezone {
1926 device: Some(gz_path.to_path_buf()),
1927 });
1928 }
1929 }
1930
1931 #[cfg(all(
1932 unix,
1933 any(target_arch = "arm", target_arch = "aarch64"),
1934 feature = "gunyah"
1935 ))]
1936 {
1937 let gunyah_path = Path::new(GUNYAH_PATH);
1938 if gunyah_path.exists() {
1939 return Some(HypervisorKind::Gunyah {
1940 device: Some(gunyah_path.to_path_buf()),
1941 qcom_trusted_vm_id: None,
1942 qcom_trusted_vm_pas_id: None,
1943 });
1944 }
1945 }
1946
1947 None
1948 }
1949
run_config(cfg: Config) -> Result<ExitState>1950 pub fn run_config(cfg: Config) -> Result<ExitState> {
1951 let components = setup_vm_components(&cfg)?;
1952
1953 let hypervisor = cfg
1954 .hypervisor
1955 .clone()
1956 .or_else(get_default_hypervisor)
1957 .context("no enabled hypervisor")?;
1958
1959 debug!("creating hypervisor: {:?}", hypervisor);
1960
1961 match hypervisor {
1962 HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
1963 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1964 #[cfg(feature = "geniezone")]
1965 HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
1966 #[cfg(all(
1967 unix,
1968 any(target_arch = "arm", target_arch = "aarch64"),
1969 feature = "gunyah"
1970 ))]
1971 HypervisorKind::Gunyah { device,
1972 qcom_trusted_vm_id,
1973 qcom_trusted_vm_pas_id
1974 } => run_gunyah(
1975 device.as_deref(),
1976 qcom_trusted_vm_id,
1977 qcom_trusted_vm_pas_id,
1978 cfg, components),
1979 }
1980 }
1981
run_vm<Vcpu, V>( cfg: Config, #[allow(unused_mut)] mut components: VmComponents, arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout, mut vm: V, irq_chip: &mut dyn IrqChipArch, ioapic_host_tube: Option<Tube>, #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, ) -> Result<ExitState> where Vcpu: VcpuArch + 'static, V: VmArch + 'static,1982 fn run_vm<Vcpu, V>(
1983 cfg: Config,
1984 #[allow(unused_mut)] mut components: VmComponents,
1985 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1986 mut vm: V,
1987 irq_chip: &mut dyn IrqChipArch,
1988 ioapic_host_tube: Option<Tube>,
1989 #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
1990 ) -> Result<ExitState>
1991 where
1992 Vcpu: VcpuArch + 'static,
1993 V: VmArch + 'static,
1994 {
1995 if cfg.jail_config.is_some() {
1996 // Printing something to the syslog before entering minijail so that libc's syslogger has a
1997 // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
1998 // access to those files will not be possible.
1999 info!("crosvm entering multiprocess mode");
2000 }
2001
2002 let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
2003 metrics::initialize(metrics_send);
2004
2005 #[cfg(all(feature = "pci-hotplug", feature = "swap"))]
2006 let swap_device_helper = match &swap_controller {
2007 Some(swap_controller) => Some(swap_controller.create_device_helper()?),
2008 None => None,
2009 };
2010 // pci-hotplug is only implemented for x86_64 for now, attempting to use it on other platform
2011 // would crash.
2012 #[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
2013 if cfg.pci_hotplug_slots.is_some() {
2014 bail!("pci-hotplug is not implemented for non x86_64 architecture");
2015 }
2016 // hotplug_manager must be created before vm is started since it forks jail warden process.
2017 #[cfg(feature = "pci-hotplug")]
2018 // TODO(293801301): Remove unused_mut after aarch64 support
2019 #[allow(unused_mut)]
2020 let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
2021 Some(PciHotPlugManager::new(
2022 vm.get_memory().clone(),
2023 &cfg,
2024 #[cfg(feature = "swap")]
2025 swap_device_helper,
2026 )?)
2027 } else {
2028 None
2029 };
2030
2031 #[cfg(feature = "usb")]
2032 let (usb_control_tube, usb_provider) =
2033 DeviceProvider::new().context("failed to create usb provider")?;
2034
2035 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
2036 // before any jailed devices have been spawned, so that we can catch any of them that fail very
2037 // quickly.
2038 let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
2039
2040 let control_server_socket = match &cfg.socket_path {
2041 Some(path) => Some(UnlinkUnixSeqpacketListener(
2042 UnixSeqpacketListener::bind(path).context("failed to create control server")?,
2043 )),
2044 None => None,
2045 };
2046
2047 let mut all_control_tubes = Vec::new();
2048 let mut add_control_tube = |t| all_control_tubes.push(t);
2049
2050 if let Some(ioapic_host_tube) = ioapic_host_tube {
2051 add_control_tube(AnyControlTube::IrqTube(ioapic_host_tube));
2052 }
2053
2054 let battery = if cfg.battery_config.is_some() {
2055 #[cfg_attr(
2056 not(feature = "power-monitor-powerd"),
2057 allow(clippy::manual_map, clippy::needless_match, unused_mut)
2058 )]
2059 let jail = if let Some(jail_config) = cfg.jail_config.as_ref() {
2060 let mut config = SandboxConfig::new(jail_config, "battery");
2061 #[cfg(feature = "power-monitor-powerd")]
2062 {
2063 config.bind_mounts = true;
2064 }
2065 let mut jail =
2066 create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
2067
2068 // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
2069 #[cfg(feature = "power-monitor-powerd")]
2070 {
2071 let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
2072 jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
2073 }
2074 Some(jail)
2075 } else {
2076 None
2077 };
2078 (cfg.battery_config.as_ref().map(|c| c.type_), jail)
2079 } else {
2080 (cfg.battery_config.as_ref().map(|c| c.type_), None)
2081 };
2082
2083 let (vm_evt_wrtube, vm_evt_rdtube) =
2084 Tube::directional_pair().context("failed to create vm event tube")?;
2085
2086 let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2087 let mut sys_allocator = SystemAllocator::new(
2088 Arch::get_system_allocator_config(&vm, arch_memory_layout),
2089 pstore_size,
2090 &cfg.mmio_address_ranges,
2091 )
2092 .context("failed to create system allocator")?;
2093
2094 let ramoops_region = match &components.pstore {
2095 Some(pstore) => Some(
2096 arch::pstore::create_memory_region(
2097 &mut vm,
2098 sys_allocator.reserved_region().unwrap(),
2099 pstore,
2100 )
2101 .context("failed to allocate pstore region")?,
2102 ),
2103 None => None,
2104 };
2105
2106 create_mmio_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
2107
2108 #[cfg(feature = "gpu")]
2109 // Hold on to the render server jail so it keeps running until we exit run_vm()
2110 let (_render_server_jail, render_server_fd) =
2111 if let Some(parameters) = &cfg.gpu_render_server_parameters {
2112 let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
2113 (Some(ScopedMinijail(jail)), Some(fd))
2114 } else {
2115 (None, None)
2116 };
2117
2118 let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
2119 BTreeMap::new();
2120 let mut iova_max_addr: Option<u64> = None;
2121
2122 let mut vfio_container_manager = VfioContainerManager::new();
2123
2124 #[cfg(feature = "registered_events")]
2125 let (reg_evt_wrtube, reg_evt_rdtube) =
2126 Tube::directional_pair().context("failed to create registered event tube")?;
2127
2128 let mut worker_process_pids = BTreeSet::new();
2129
2130 let mut devices = create_devices(
2131 &cfg,
2132 &mut vm,
2133 &mut sys_allocator,
2134 &mut add_control_tube,
2135 &vm_evt_wrtube,
2136 &mut iommu_attached_endpoints,
2137 #[cfg(feature = "usb")]
2138 usb_provider,
2139 #[cfg(feature = "gpu")]
2140 render_server_fd,
2141 &mut iova_max_addr,
2142 #[cfg(feature = "registered_events")]
2143 ®_evt_wrtube,
2144 &mut vfio_container_manager,
2145 &mut worker_process_pids,
2146 )?;
2147
2148 #[cfg(feature = "pci-hotplug")]
2149 // TODO(293801301): Remove unused_variables after aarch64 support
2150 #[allow(unused_variables)]
2151 let pci_hotplug_slots = cfg.pci_hotplug_slots;
2152 #[cfg(not(feature = "pci-hotplug"))]
2153 #[allow(unused_variables)]
2154 let pci_hotplug_slots: Option<u8> = None;
2155 #[cfg(target_arch = "x86_64")]
2156 let hp_stub = create_pure_virtual_pcie_root_port(
2157 &mut sys_allocator,
2158 &mut add_control_tube,
2159 &mut devices,
2160 pci_hotplug_slots.unwrap_or(1),
2161 )?;
2162
2163 arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
2164
2165 let pci_devices: Vec<&dyn PciDevice> = devices
2166 .iter()
2167 .filter_map(|d| (d.0).as_pci_device())
2168 .collect();
2169
2170 let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
2171 .into_iter()
2172 .flat_map(|s| {
2173 if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
2174 std::iter::zip(
2175 Some(virtio_pci_device.virtio_device()),
2176 virtio_pci_device.pci_address(),
2177 )
2178 .next()
2179 } else {
2180 None
2181 }
2182 })
2183 .collect();
2184
2185 let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
2186 .iter()
2187 .flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
2188 .collect();
2189
2190 // order the OpenFirmware device paths, in ascending order, by their boot_index
2191 open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));
2192
2193 // "/pci@iocf8/" is x86 specific and represents the root at the system bus port
2194 let mut bootorder_fw_cfg_blob =
2195 open_firmware_device_paths
2196 .into_iter()
2197 .fold(Vec::new(), |a, b| {
2198 a.into_iter()
2199 .chain("/pci@i0cf8/".as_bytes().iter().copied())
2200 .chain(b.0)
2201 .chain("\n".as_bytes().iter().copied())
2202 .collect()
2203 });
2204
2205 // the "bootorder" file is expected to end with a null terminator
2206 bootorder_fw_cfg_blob.push(0);
2207
2208 components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;
2209
2210 // if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
2211 // "bootorder" file can be accessed by the guest.
2212 components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;
2213
2214 let (translate_response_senders, request_rx) = setup_virtio_access_platform(
2215 &mut sys_allocator,
2216 &mut iommu_attached_endpoints,
2217 &mut devices,
2218 )?;
2219
2220 #[cfg(target_arch = "x86_64")]
2221 let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
2222 #[cfg(not(target_arch = "x86_64"))]
2223 let iommu_bus_ranges = Vec::new();
2224
2225 let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
2226 || (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
2227 {
2228 let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
2229 let iommu_dev = create_iommu_device(
2230 cfg.protection_type,
2231 cfg.jail_config.as_ref(),
2232 iova_max_addr.unwrap_or(u64::MAX),
2233 iommu_attached_endpoints,
2234 iommu_bus_ranges,
2235 translate_response_senders,
2236 request_rx,
2237 iommu_device_tube,
2238 )?;
2239
2240 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
2241 add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
2242 let (ioevent_host_tube, ioevent_device_tube) =
2243 Tube::pair().context("failed to create ioevent tube")?;
2244 add_control_tube(
2245 VmMemoryTube {
2246 tube: ioevent_host_tube,
2247 expose_with_viommu: false,
2248 }
2249 .into(),
2250 );
2251 let (host_tube, device_tube) =
2252 Tube::pair().context("failed to create device control tube")?;
2253 add_control_tube(TaggedControlTube::Vm(host_tube).into());
2254 let mut dev = VirtioPciDevice::new(
2255 vm.get_memory().clone(),
2256 iommu_dev.dev,
2257 msi_device_tube,
2258 cfg.disable_virtio_intx,
2259 None,
2260 VmMemoryClient::new(ioevent_device_tube),
2261 device_tube,
2262 )
2263 .context("failed to create virtio pci dev")?;
2264 // early reservation for viommu.
2265 dev.allocate_address(&mut sys_allocator)
2266 .context("failed to allocate resources early for virtio pci dev")?;
2267 let dev = Box::new(dev);
2268 devices.push((dev, iommu_dev.jail));
2269 Some(iommu_host_tube)
2270 } else {
2271 None
2272 };
2273
2274 #[cfg(target_arch = "x86_64")]
2275 for device in devices
2276 .iter_mut()
2277 .filter_map(|(dev, _)| dev.as_pci_device_mut())
2278 {
2279 device.generate_acpi(&mut components.acpi_sdts);
2280 }
2281
2282 // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
2283 let mut vcpu_ids = Vec::new();
2284
2285 let guest_suspended_cvar = if cfg.force_s2idle {
2286 Some(Arc::new((Mutex::new(false), Condvar::new())))
2287 } else {
2288 None
2289 };
2290
2291 let dt_overlays = cfg
2292 .device_tree_overlay
2293 .iter()
2294 .map(|o| {
2295 Ok(DtbOverlay {
2296 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2297 .with_context(|| {
2298 format!("failed to open device tree overlay {}", o.path.display())
2299 })?,
2300 do_filter: o.filter_devs,
2301 })
2302 })
2303 .collect::<Result<Vec<DtbOverlay>>>()?;
2304
2305 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2306 let vcpu_domain_paths = components.vcpu_domain_paths.clone();
2307
2308 let mut linux = Arch::build_vm::<V, Vcpu>(
2309 components,
2310 arch_memory_layout,
2311 &vm_evt_wrtube,
2312 &mut sys_allocator,
2313 &cfg.serial_parameters,
2314 simple_jail(cfg.jail_config.as_ref(), "serial_device")?,
2315 battery,
2316 vm,
2317 ramoops_region,
2318 devices,
2319 irq_chip,
2320 &mut vcpu_ids,
2321 cfg.dump_device_tree_blob.clone(),
2322 simple_jail(cfg.jail_config.as_ref(), "serial_device")?,
2323 #[cfg(target_arch = "x86_64")]
2324 simple_jail(cfg.jail_config.as_ref(), "block_device")?,
2325 #[cfg(target_arch = "x86_64")]
2326 simple_jail(cfg.jail_config.as_ref(), "fw_cfg_device")?,
2327 #[cfg(feature = "swap")]
2328 &mut swap_controller,
2329 guest_suspended_cvar.clone(),
2330 dt_overlays,
2331 cfg.fdt_position,
2332 cfg.no_pmu,
2333 )
2334 .context("the architecture failed to build the vm")?;
2335
2336 for tube in linux.vm_request_tubes.drain(..) {
2337 add_control_tube(TaggedControlTube::Vm(tube).into());
2338 }
2339
2340 #[cfg(target_arch = "x86_64")]
2341 let (hp_control_tube, hp_worker_tube) = mpsc::channel();
2342 #[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
2343 if let Some(hotplug_manager) = &mut hotplug_manager {
2344 hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
2345 }
2346 #[cfg(target_arch = "x86_64")]
2347 let hp_thread = {
2348 for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
2349 #[cfg(feature = "pci-hotplug")]
2350 if let Some(hotplug_manager) = &mut hotplug_manager {
2351 hotplug_manager.add_port(hp_bus)?;
2352 } else {
2353 linux.hotplug_bus.insert(bus_num, hp_bus);
2354 }
2355 #[cfg(not(feature = "pci-hotplug"))]
2356 linux.hotplug_bus.insert(bus_num, hp_bus);
2357 }
2358
2359 if let Some(pm) = &linux.pm {
2360 for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
2361 pm.lock().register_gpe_notify_dev(gpe, notify_dev);
2362 }
2363 for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
2364 pm.lock().register_pme_notify_dev(bus, notify_dev);
2365 }
2366 }
2367
2368 let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
2369 Tube::pair().context("failed to create tube")?;
2370 add_control_tube(
2371 VmMemoryTube {
2372 tube: hp_vm_mem_host_tube,
2373 expose_with_viommu: false,
2374 }
2375 .into(),
2376 );
2377
2378 let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
2379 let pci_root = linux.root_config.clone();
2380 std::thread::Builder::new()
2381 .name("pci_root".to_string())
2382 .spawn(move || {
2383 start_pci_root_worker(
2384 supports_readonly_mapping,
2385 pci_root,
2386 hp_worker_tube,
2387 hp_vm_mem_worker_tube,
2388 )
2389 })?
2390 };
2391
2392 let flags = RutabagaGrallocBackendFlags::new().disable_vulkano();
2393 let gralloc = RutabagaGralloc::new(flags).context("failed to create gralloc")?;
2394
2395 run_control(
2396 linux,
2397 sys_allocator,
2398 cfg,
2399 control_server_socket,
2400 all_control_tubes,
2401 #[cfg(feature = "usb")]
2402 usb_control_tube,
2403 vm_evt_rdtube,
2404 vm_evt_wrtube,
2405 sigchld_fd,
2406 gralloc,
2407 vcpu_ids,
2408 iommu_host_tube,
2409 #[cfg(target_arch = "x86_64")]
2410 hp_control_tube,
2411 #[cfg(target_arch = "x86_64")]
2412 hp_thread,
2413 #[cfg(feature = "pci-hotplug")]
2414 hotplug_manager,
2415 #[cfg(feature = "swap")]
2416 swap_controller,
2417 #[cfg(feature = "registered_events")]
2418 reg_evt_rdtube,
2419 guest_suspended_cvar,
2420 metrics_recv,
2421 vfio_container_manager,
2422 worker_process_pids,
2423 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2424 vcpu_domain_paths,
2425 )
2426 }
2427
// Hotplug commands can deadlock if handled directly on the vm control thread.
// That thread (thread A) tries to acquire the PCI root lock while handling the
// hotplug command, but the lock may already be held by a device on thread B,
// which is itself blocked sending a vm control request to thread A and waiting
// for the response. With thread A blocked on the lock, neither side can make
// progress. To avoid this, all work that locks the PCI root is pushed to this
// dedicated worker thread.
2436 #[cfg(target_arch = "x86_64")]
start_pci_root_worker( supports_readonly_mapping: bool, pci_root: Arc<Mutex<PciRoot>>, hp_device_tube: mpsc::Receiver<PciRootCommand>, vm_control_tube: Tube, )2437 fn start_pci_root_worker(
2438 supports_readonly_mapping: bool,
2439 pci_root: Arc<Mutex<PciRoot>>,
2440 hp_device_tube: mpsc::Receiver<PciRootCommand>,
2441 vm_control_tube: Tube,
2442 ) {
2443 struct PciMmioMapperTube {
2444 supports_readonly_mapping: bool,
2445 vm_control_tube: Tube,
2446 registered_regions: BTreeMap<u32, VmMemoryRegionId>,
2447 next_id: u32,
2448 }
2449
2450 impl PciMmioMapper for PciMmioMapperTube {
2451 fn supports_readonly_mapping(&self) -> bool {
2452 self.supports_readonly_mapping
2453 }
2454
2455 fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
2456 let shmem = shmem
2457 .try_clone()
2458 .context("failed to create new SharedMemory")?;
2459 self.vm_control_tube
2460 .send(&VmMemoryRequest::RegisterMemory {
2461 source: VmMemorySource::SharedMemory(shmem),
2462 dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
2463 prot: Protection::read(),
2464 cache: MemCacheType::CacheCoherent,
2465 })
2466 .context("failed to send request")?;
2467 match self.vm_control_tube.recv::<VmMemoryResponse>() {
2468 Ok(VmMemoryResponse::RegisterMemory { region_id, .. }) => {
2469 let cur_id = self.next_id;
2470 self.registered_regions.insert(cur_id, region_id);
2471 self.next_id += 1;
2472 Ok(cur_id)
2473 }
2474 res => bail!("Bad response: {:?}", res),
2475 }
2476 }
2477 }
2478
2479 let mut mapper = PciMmioMapperTube {
2480 supports_readonly_mapping,
2481 vm_control_tube,
2482 registered_regions: BTreeMap::new(),
2483 next_id: 0,
2484 };
2485
2486 loop {
2487 match hp_device_tube.recv() {
2488 Ok(cmd) => match cmd {
2489 PciRootCommand::Add(addr, device) => {
2490 if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
2491 error!("failed to add hotplugged device to PCI root port: {}", e);
2492 }
2493 }
2494 PciRootCommand::AddBridge(pci_bus) => {
2495 if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
2496 error!("failed to add hotplugged bridge to PCI root port: {}", e);
2497 }
2498 }
2499 PciRootCommand::Remove(addr) => {
2500 pci_root.lock().remove_device(addr);
2501 }
2502 PciRootCommand::Kill => break,
2503 },
2504 Err(e) => {
2505 error!("Error: pci root worker channel closed: {}", e);
2506 break;
2507 }
2508 }
2509 }
2510 }
2511
2512 #[cfg(target_arch = "x86_64")]
get_hp_bus<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, host_addr: PciAddress, ) -> Result<Arc<Mutex<dyn HotPlugBus>>>2513 fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2514 linux: &RunnableLinuxVm<V, Vcpu>,
2515 host_addr: PciAddress,
2516 ) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2517 for (_, hp_bus) in linux.hotplug_bus.iter() {
2518 if hp_bus.lock().is_match(host_addr).is_some() {
2519 return Ok(hp_bus.clone());
2520 }
2521 }
2522 Err(anyhow!("Failed to find a suitable hotplug bus"))
2523 }
2524
#[cfg(target_arch = "x86_64")]
/// Hotplugs the host PCI device described by `device` into a running VM.
///
/// For PCIe upstream/downstream ports this builds a `PciBridge` backed by the host port and
/// records its secondary bus as a new hotplug bus; for endpoints it builds a VFIO PCI device
/// (and, if a virtio-iommu is present, attaches the endpoint to it via `iommu_host_tube`).
/// The new device is then registered with the hotplug bus matching its host address, and a
/// hotplug interrupt is fired if `device.hp_interrupt` is set.
///
/// Returns an error if the device path has no valid PCI address, no matching hotplug bus
/// exists, or any tube/device construction step fails.
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    cfg: &Config,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hp_control_tube: &mpsc::Sender<PciRootCommand>,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
    #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
    vfio_container_manager: &mut VfioContainerManager,
) -> Result<()> {
    // The host PCI address doubles as the identity (hotplug key) of the new device.
    let host_addr = PciAddress::from_path(&device.path)
        .context("failed to parse hotplug device's PCI address")?;
    let hp_bus = get_hp_bus(linux, host_addr)?;

    let (hotplug_key, pci_address) = match device.device_type {
        HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
            // Both port flavors need VM-control and MSI tubes wired into the control loop.
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(TaggedControlTube::Vm(vm_host_tube).into());
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
            let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
            let (hotplug_key, pci_bridge) = match device.device_type {
                HotPlugDeviceType::UpstreamPort => {
                    let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
                    let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
                        pcie_host, true,
                    )?));
                    let pci_bridge =
                        Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
                    // The bridge's secondary bus becomes a hotplug bus for devices behind it.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
                    (hotplug_key, pci_bridge)
                }
                HotPlugDeviceType::DownstreamPort => {
                    let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
                    let pcie_downstream_port = Arc::new(Mutex::new(
                        PcieDownstreamPort::new_from_host(pcie_host, true)?,
                    ));
                    let pci_bridge = Box::new(PciBridge::new(
                        pcie_downstream_port.clone(),
                        msi_device_tube,
                    ));
                    // The bridge's secondary bus becomes a hotplug bus for devices behind it.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
                    (hotplug_key, pci_bridge)
                }
                _ => {
                    // Unreachable: the outer match already restricted device_type to ports.
                    bail!("Impossible to reach here")
                }
            };
            let pci_address = Arch::register_pci_device(
                linux,
                pci_bridge,
                None,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;

            (hotplug_key, pci_address)
        }
        HotPlugDeviceType::EndPoint => {
            let hotplug_key = HotPlugKey::HostVfio { host_addr };
            // Place the endpoint behind the virtio-iommu only when one exists.
            let (vfio_device, jail, viommu_mapper) = create_vfio_device(
                cfg.jail_config.as_ref(),
                &linux.vm,
                sys_allocator,
                add_control_tube,
                &device.path,
                true,
                None,
                None,
                None,
                if iommu_host_tube.is_some() {
                    IommuDevType::VirtioIommu
                } else {
                    IommuDevType::NoIommu
                },
                None,
                vfio_container_manager,
            )?;
            let vfio_pci_device = match vfio_device {
                VfioDeviceVariant::Pci(pci) => Box::new(pci),
                VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
            };
            let pci_address = Arch::register_pci_device(
                linux,
                vfio_pci_device,
                jail,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
            // Tell the virtio-iommu about the new endpoint so DMA translation covers it.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let endpoint_addr = pci_address.to_u32();
                let vfio_wrapper = viommu_mapper.context("expected mapper")?;
                let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
                        endpoint_addr,
                        wrapper_id: vfio_wrapper.id(),
                        container: {
                            // SAFETY:
                            // Safe because the descriptor is uniquely owned by `descriptor`.
                            unsafe { File::from_raw_descriptor(descriptor) }
                        },
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }

            (hotplug_key, pci_address)
        }
    };
    hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
    // Optionally notify the guest via a hotplug interrupt on the parent bus.
    if device.hp_interrupt {
        hp_bus.lock().hot_plug(pci_address)?;
    }
    Ok(())
}
2655
/// Hotplugs a virtio-net device described by `net_param` into a running VM.
///
/// Creates the MSI, ioevent, and VM-control tubes the device needs, registers their host
/// ends with the control loop via `add_control_tube`, and asks the hotplug manager to plug
/// the device. Returns the guest PCI bus number the device was placed on.
#[cfg(feature = "pci-hotplug")]
fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    net_param: NetParameters,
) -> Result<u8> {
    // Interrupt (MSI/MSI-X) tube.
    let (msi_host_tube, msi_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(AnyControlTube::IrqTube(msi_host_tube));

    // Ioevent registration tube.
    let (ioevent_host_tube, ioevent_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(
        VmMemoryTube {
            tube: ioevent_host_tube,
            expose_with_viommu: false,
        }
        .into(),
    );

    // General VM control tube.
    let (vm_control_host_tube, vm_control_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(TaggedControlTube::Vm(vm_control_host_tube).into());

    let carrier = NetResourceCarrier::new(
        net_param,
        msi_device_tube,
        VmMemoryClient::new(ioevent_device_tube),
        vm_control_device_tube,
    );
    hotplug_manager.hotplug_device(
        vec![ResourceCarrier::VirtioNet(carrier)],
        linux,
        sys_allocator,
    )
}
2689
/// Dispatches a network hotplug control command to the add or remove handler.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
    net_cmd: NetControlCommand,
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
) -> VmResponse {
    match net_cmd {
        NetControlCommand::RemoveTap(bus) => {
            handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus)
        }
        NetControlCommand::AddTap(tap_name) => handle_hotplug_net_add(
            linux,
            sys_allocator,
            add_control_tube,
            hotplug_manager,
            &tap_name,
        ),
    }
}
2711
/// Hotplugs a virtio-net device backed by the host TAP interface `tap_name`.
///
/// Returns `VmResponse::PciHotPlugResponse` carrying the assigned guest PCI bus on
/// success, or an error-string response on failure.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    tap_name: &str,
) -> VmResponse {
    // Build a minimal config for a TAP-backed virtio-net device; everything else default.
    let net_param = NetParameters {
        mode: NetParametersMode::TapName {
            tap_name: tap_name.to_owned(),
            mac: None,
        },
        vhost_net: None,
        vq_pairs: None,
        packed_queue: false,
        pci_address: None,
    };
    match add_hotplug_net(
        linux,
        sys_allocator,
        add_control_tube,
        hotplug_manager,
        net_param,
    ) {
        Ok(pci_bus) => VmResponse::PciHotPlugResponse { bus: pci_bus },
        Err(e) => VmResponse::ErrString(format!("{:?}", e)),
    }
}
2744
/// Removes a previously hotplugged virtio-net device from guest PCI bus `bus`.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    hotplug_manager: &mut PciHotPlugManager,
    bus: u8,
) -> VmResponse {
    if let Err(e) = hotplug_manager.remove_hotplug_device(bus, linux, sys_allocator) {
        VmResponse::ErrString(format!("{:?}", e))
    } else {
        VmResponse::Ok
    }
}
2757
#[cfg(target_arch = "x86_64")]
/// Hot-unplugs the device identified by `hotplug_key` from whichever hotplug bus holds it,
/// releasing its PCI resources and recording `child_bus` in `buses_to_remove`.
///
/// If removing the device leaves its bus empty and that bus is itself a hotplugged bridge
/// (it has a hotplug key), the bridge is recursively unplugged as well. Returns an error
/// if no hotplug bus contains the device.
fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
    linux: &RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    buses_to_remove: &mut Vec<u8>,
    hotplug_key: HotPlugKey,
    child_bus: u8,
) -> Result<()> {
    for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
        let mut hp_bus_lock = hp_bus.lock();
        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
            // Free the guest PCI resources before signaling the unplug to the guest.
            sys_allocator.release_pci(pci_addr);
            hp_bus_lock.hot_unplug(pci_addr)?;
            buses_to_remove.push(child_bus);
            if hp_bus_lock.is_empty() {
                // The bus is now empty; if it is itself a hotplugged bridge, unplug it too.
                // NOTE(review): recursion happens while this bus's lock is still held —
                // relies on the recursive call matching a different bus.
                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
                    remove_hotplug_bridge(
                        linux,
                        sys_allocator,
                        buses_to_remove,
                        hotplug_key,
                        *bus_num,
                    )?;
                }
            }
            return Ok(());
        }
    }

    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        hotplug_key
    ))
}
2792
2793 #[cfg(target_arch = "x86_64")]
remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, iommu_host_tube: Option<&Tube>, device: &HotPlugDeviceInfo, ) -> Result<()>2794 fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
2795 linux: &mut RunnableLinuxVm<V, Vcpu>,
2796 sys_allocator: &mut SystemAllocator,
2797 iommu_host_tube: Option<&Tube>,
2798 device: &HotPlugDeviceInfo,
2799 ) -> Result<()> {
2800 let host_addr = PciAddress::from_path(&device.path)?;
2801 let hotplug_key = match device.device_type {
2802 HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
2803 HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
2804 HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
2805 };
2806
2807 let hp_bus = linux
2808 .hotplug_bus
2809 .iter()
2810 .find(|(_, hp_bus)| {
2811 let hp_bus = hp_bus.lock();
2812 hp_bus.get_hotplug_device(hotplug_key).is_some()
2813 })
2814 .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));
2815
2816 if let Some((bus_num, hp_bus)) = hp_bus {
2817 let mut buses_to_remove = Vec::new();
2818 let mut removed_key = None;
2819 let mut hp_bus_lock = hp_bus.lock();
2820 if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2821 if let Some(iommu_host_tube) = iommu_host_tube {
2822 let request =
2823 VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
2824 endpoint_addr: pci_addr.to_u32(),
2825 });
2826 match virtio_iommu_request(iommu_host_tube, &request)
2827 .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
2828 {
2829 VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
2830 resp => bail!("Unexpected message response: {:?}", resp),
2831 }
2832 }
2833 let mut empty_simbling = true;
2834 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
2835 hp_bus_lock.get_hotplug_key()
2836 {
2837 let addr_alias = host_addr;
2838 for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
2839 if *simbling_bus_num != bus_num {
2840 let hp_bus_lock = hp_bus.lock();
2841 let hotplug_key = hp_bus_lock.get_hotplug_key();
2842 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
2843 if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
2844 empty_simbling = false;
2845 break;
2846 }
2847 }
2848 }
2849 }
2850 }
2851
2852 // If all simbling downstream ports are empty, do not send hot unplug event for this
2853 // downstream port. Root port will send one plug out interrupt and remove all
2854 // the remaining devices
2855 if !empty_simbling {
2856 hp_bus_lock.hot_unplug(pci_addr)?;
2857 }
2858
2859 sys_allocator.release_pci(pci_addr);
2860 if empty_simbling || hp_bus_lock.is_empty() {
2861 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2862 removed_key = Some(hotplug_key);
2863 remove_hotplug_bridge(
2864 linux,
2865 sys_allocator,
2866 &mut buses_to_remove,
2867 hotplug_key,
2868 bus_num,
2869 )?;
2870 }
2871 }
2872 }
2873
        // Some types of TBT devices have a few empty downstream ports. The emulated bridges
        // of these ports won't be removed since no vfio device is connected to our emulated
        // bridges. So we explicitly check all sibling bridges of the removed bridge here,
        // and remove them if the bridge has no child device connected.
2878 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
2879 let addr_alias = host_addr;
2880 for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
2881 if *simbling_bus_num != bus_num {
2882 let hp_bus_lock = hp_bus.lock();
2883 let hotplug_key = hp_bus_lock.get_hotplug_key();
2884 if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
2885 if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
2886 remove_hotplug_bridge(
2887 linux,
2888 sys_allocator,
2889 &mut buses_to_remove,
2890 hotplug_key.unwrap(),
2891 *simbling_bus_num,
2892 )?;
2893 }
2894 }
2895 }
2896 }
2897 }
2898 for bus in buses_to_remove.iter() {
2899 linux.hotplug_bus.remove(bus);
2900 }
2901 return Ok(());
2902 }
2903
2904 Err(anyhow!(
2905 "Can not find device {:?} on hotplug buses",
2906 hotplug_key
2907 ))
2908 }
2909
trigger_vm_suspend_and_wait_for_entry( guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>, tube: &SendTube, response: vm_control::VmResponse, suspend_tube: Arc<Mutex<SendTube>>, pm: Option<Arc<Mutex<dyn PmResource + Send>>>, )2910 pub fn trigger_vm_suspend_and_wait_for_entry(
2911 guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
2912 tube: &SendTube,
2913 response: vm_control::VmResponse,
2914 suspend_tube: Arc<Mutex<SendTube>>,
2915 pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
2916 ) {
2917 let (lock, cvar) = &*guest_suspended_cvar;
2918 let mut guest_suspended = lock.lock();
2919
2920 *guest_suspended = false;
2921
2922 // During suspend also emulate sleepbtn, which allows to suspend VM (if running e.g. acpid and
2923 // reacts on sleep button events)
2924 if let Some(pm) = pm {
2925 pm.lock().slpbtn_evt();
2926 } else {
2927 error!("generating sleepbtn during suspend not supported");
2928 }
2929
2930 // Wait for notification about guest suspension, if not received after 15sec,
2931 // proceed anyway.
2932 let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
2933 guest_suspended = result.0;
2934
2935 if result.1.timed_out() {
2936 warn!("Guest suspension timeout - proceeding anyway");
2937 } else if *guest_suspended {
2938 info!("Guest suspended");
2939 }
2940
2941 if let Err(e) = suspend_tube.lock().send(&true) {
2942 error!("failed to trigger suspend event: {}", e);
2943 }
2944 // Now we ready to send response over the tube and communicate that VM suspend has finished
2945 if let Err(e) = tube.send(&response) {
2946 error!("failed to send VmResponse: {}", e);
2947 }
2948 }
2949
#[cfg(feature = "pvclock")]
#[derive(Debug)]
/// The action requested by the pvclock device to perform on the main thread.
enum PvClockAction {
    #[cfg(target_arch = "aarch64")]
    /// Update the counter offset with VmAarch64::set_counter_offset.
    // On other architectures no main-thread action exists: the offset is
    // applied via shared memory in the pvclock device worker (see
    // `send_pvclock_cmd`).
    SetCounterOffset(u64),
}
2958
#[cfg(feature = "pvclock")]
/// Sends `command` to the pvclock device and interprets the reply.
///
/// Returns `Ok(Some(_))` when the device asks the main thread to perform a
/// follow-up [`PvClockAction`], `Ok(None)` when nothing further is needed,
/// and an error when the device reports failure or the tube transfer fails.
fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<Option<PvClockAction>> {
    tube.send(&command)
        .with_context(|| format!("failed to send pvclock command {:?}", command))?;
    let resp = tube
        .recv::<PvClockCommandResponse>()
        .context("failed to receive pvclock command response")?;
    match resp {
        PvClockCommandResponse::Ok => {
            info!("{command:?} completed with {resp:?}");
            Ok(None)
        }
        PvClockCommandResponse::DeviceInactive => {
            warn!("Tried to send {command:?} but pvclock device was inactive");
            Ok(None)
        }
        PvClockCommandResponse::Resumed {
            total_suspended_ticks,
        } => {
            info!("{command:?} completed with {total_suspended_ticks} total_suspended_ticks");
            cfg_if::cfg_if! {
                if #[cfg(target_arch = "aarch64")] {
                    Ok(Some(PvClockAction::SetCounterOffset(total_suspended_ticks)))
                } else {
                    // For non-AArch64 platforms this is handled by directly updating the offset in
                    // shared memory in the pvclock device worker.
                    Ok(None)
                }
            }
        }
        PvClockCommandResponse::Err(e) => {
            bail!("pvclock encountered error on {:?}: {}", command, e);
        }
    }
}
2994
2995 #[cfg(target_arch = "x86_64")]
handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, cfg: &Config, add_control_tube: &mut impl FnMut(AnyControlTube), hp_control_tube: &mpsc::Sender<PciRootCommand>, iommu_host_tube: Option<&Tube>, device: &HotPlugDeviceInfo, add: bool, #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>, vfio_container_manager: &mut VfioContainerManager, ) -> VmResponse2996 fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
2997 linux: &mut RunnableLinuxVm<V, Vcpu>,
2998 sys_allocator: &mut SystemAllocator,
2999 cfg: &Config,
3000 add_control_tube: &mut impl FnMut(AnyControlTube),
3001 hp_control_tube: &mpsc::Sender<PciRootCommand>,
3002 iommu_host_tube: Option<&Tube>,
3003 device: &HotPlugDeviceInfo,
3004 add: bool,
3005 #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
3006 vfio_container_manager: &mut VfioContainerManager,
3007 ) -> VmResponse {
3008 let iommu_host_tube = if cfg.vfio_isolate_hotplug {
3009 iommu_host_tube
3010 } else {
3011 None
3012 };
3013
3014 let ret = if add {
3015 add_hotplug_device(
3016 linux,
3017 sys_allocator,
3018 cfg,
3019 add_control_tube,
3020 hp_control_tube,
3021 iommu_host_tube,
3022 device,
3023 #[cfg(feature = "swap")]
3024 swap_controller,
3025 vfio_container_manager,
3026 )
3027 } else {
3028 remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
3029 };
3030
3031 match ret {
3032 Ok(()) => VmResponse::Ok,
3033 Err(e) => {
3034 error!("handle_hotplug_command failure: {}", e);
3035 VmResponse::Err(base::Error::new(libc::EINVAL))
3036 }
3037 }
3038 }
3039
/// Borrowed view of all state the control loop needs while servicing a single
/// event; threaded through `process_vm_request` and `process_vm_control_event`.
struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
    linux: &'a mut RunnableLinuxVm<V, Vcpu>,
    cfg: &'a Config,
    sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
    /// Registered control tubes, keyed by the id carried in the wait-context
    /// token (used e.g. to route balloon responses back to their requester).
    control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
    disk_host_tubes: &'a [Tube],
    #[cfg(feature = "audio")]
    snd_host_tubes: &'a [Tube],
    #[cfg(feature = "gpu")]
    gpu_control_tube: Option<&'a Tube>,
    #[cfg(feature = "usb")]
    usb_control_tube: &'a Tube,
    #[cfg(target_arch = "x86_64")]
    iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
    #[cfg(target_arch = "x86_64")]
    hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
    /// Condvar signalled when the guest reports suspension (s2idle flow);
    /// `None` when that notification path is unavailable.
    guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
    #[cfg(feature = "pci-hotplug")]
    hotplug_manager: &'a mut Option<PciHotPlugManager>,
    #[cfg(feature = "swap")]
    swap_controller: &'a mut Option<SwapController>,
    /// Join handles and control channels for the vCPU threads, indexed by vCPU.
    vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
    #[cfg(feature = "balloon")]
    balloon_tube: Option<&'a mut BalloonTube>,
    device_ctrl_tube: &'a Tube,
    irq_handler_control: &'a Tube,
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    vm_memory_handler_control: &'a Tube,
    #[cfg(feature = "registered_events")]
    registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    #[cfg(feature = "pvclock")]
    pvclock_host_tube: Option<Arc<Tube>>,
    vfio_container_manager: &'a mut VfioContainerManager,
    suspended_pvclock_state: &'a mut Option<hypervisor::ClockState>,
    /// Per-vCPU (pid, tid) pairs, returned for `VmRequest::VcpuPidTid`.
    vcpus_pid_tid: &'a BTreeMap<usize, (u32, u32)>,
}
3076
/// Outcome of servicing a single `VmRequest` in the control loop.
struct VmRequestResult {
    /// Response to send back on the requesting tube; `None` when the response
    /// is deferred or delivered elsewhere (e.g. by the s2idle_wait thread).
    response: Option<VmResponse>,
    /// When true the control loop should shut down (set for `VmRequest::Exit`).
    exit: bool,
}
3081
3082 impl VmRequestResult {
new(response: Option<VmResponse>, exit: bool) -> Self3083 fn new(response: Option<VmResponse>, exit: bool) -> Self {
3084 VmRequestResult { response, exit }
3085 }
3086 }
3087
process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, tube: &Tube, request: VmRequest, #[cfg_attr( not(any(target_arch = "x86_64", feature = "pci-hotplug")), allow(unused_variables, clippy::ptr_arg) )] add_tubes: &mut Vec<TaggedControlTube>, ) -> Result<VmRequestResult>3088 fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3089 state: &mut ControlLoopState<V, Vcpu>,
3090 id: usize,
3091 tube: &Tube,
3092 request: VmRequest,
3093 #[cfg_attr(
3094 not(any(target_arch = "x86_64", feature = "pci-hotplug")),
3095 allow(unused_variables, clippy::ptr_arg)
3096 )]
3097 add_tubes: &mut Vec<TaggedControlTube>,
3098 ) -> Result<VmRequestResult> {
3099 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3100 let mut add_irq_control_tubes = Vec::new();
3101 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3102 let mut add_vm_memory_control_tubes = Vec::new();
3103
3104 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3105 let mut add_control_tube = |t| match t {
3106 AnyControlTube::DeviceControlTube(_) => {
3107 panic!("hotplugging DeviceControlTube not supported yet")
3108 }
3109 AnyControlTube::IrqTube(t) => add_irq_control_tubes.push(t),
3110 AnyControlTube::TaggedControlTube(t) => add_tubes.push(t),
3111 AnyControlTube::VmMemoryTube(t) => add_vm_memory_control_tubes.push(t),
3112 };
3113
3114 let response = match request {
3115 VmRequest::Exit => {
3116 return Ok(VmRequestResult::new(Some(VmResponse::Ok), true));
3117 }
3118 VmRequest::HotPlugVfioCommand { device, add } => {
3119 #[cfg(target_arch = "x86_64")]
3120 {
3121 handle_hotplug_command(
3122 state.linux,
3123 &mut state.sys_allocator.lock(),
3124 state.cfg,
3125 &mut add_control_tube,
3126 state.hp_control_tube,
3127 state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
3128 &device,
3129 add,
3130 #[cfg(feature = "swap")]
3131 state.swap_controller,
3132 state.vfio_container_manager,
3133 )
3134 }
3135
3136 #[cfg(not(target_arch = "x86_64"))]
3137 {
3138 // Suppress warnings.
3139 let _ = (device, add);
3140 let _ = &state.vfio_container_manager;
3141 VmResponse::Ok
3142 }
3143 }
3144 #[cfg(feature = "pci-hotplug")]
3145 VmRequest::HotPlugNetCommand(net_cmd) => {
3146 if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
3147 handle_hotplug_net_command(
3148 net_cmd,
3149 state.linux,
3150 &mut state.sys_allocator.lock(),
3151 &mut add_control_tube,
3152 hotplug_manager,
3153 )
3154 } else {
3155 VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
3156 }
3157 }
3158 #[cfg(feature = "registered_events")]
3159 VmRequest::RegisterListener { socket_addr, event } => {
3160 let (registered_tube, already_registered) =
3161 find_registered_tube(state.registered_evt_tubes, &socket_addr, event);
3162
3163 if !already_registered {
3164 let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;
3165
3166 if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
3167 tubes.insert(addr_tube);
3168 } else {
3169 state
3170 .registered_evt_tubes
3171 .insert(event, vec![addr_tube].into_iter().collect());
3172 }
3173 }
3174 VmResponse::Ok
3175 }
3176 #[cfg(feature = "registered_events")]
3177 VmRequest::UnregisterListener { socket_addr, event } => {
3178 if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
3179 tubes.retain(|t| t.socket_addr != socket_addr);
3180 }
3181 state
3182 .registered_evt_tubes
3183 .retain(|_, tubes| !tubes.is_empty());
3184 VmResponse::Ok
3185 }
3186 #[cfg(feature = "registered_events")]
3187 VmRequest::Unregister { socket_addr } => {
3188 for (_, tubes) in state.registered_evt_tubes.iter_mut() {
3189 tubes.retain(|t| t.socket_addr != socket_addr);
3190 }
3191 state
3192 .registered_evt_tubes
3193 .retain(|_, tubes| !tubes.is_empty());
3194 VmResponse::Ok
3195 }
3196 #[cfg(feature = "balloon")]
3197 VmRequest::BalloonCommand(cmd) => {
3198 if let Some(tube) = state.balloon_tube.as_mut() {
3199 let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
3200 return Ok(VmRequestResult::new(None, false));
3201 };
3202 if key != id {
3203 let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
3204 return Ok(VmRequestResult::new(None, false));
3205 };
3206 if let Err(e) = tube.send(&r) {
3207 error!("failed to send VmResponse: {}", e);
3208 }
3209 return Ok(VmRequestResult::new(None, false));
3210 }
3211 r
3212 } else {
3213 VmResponse::Err(base::Error::new(libc::ENOTSUP))
3214 }
3215 }
3216 VmRequest::VcpuPidTid => VmResponse::VcpuPidTidResponse {
3217 pid_tid_map: state.vcpus_pid_tid.clone(),
3218 },
3219 VmRequest::Throttle(vcpu, cycles) => {
3220 vcpu::kick_vcpu(
3221 &state.vcpu_handles.get(vcpu),
3222 state.linux.irq_chip.as_irq_chip(),
3223 VcpuControl::Throttle(cycles),
3224 );
3225 return Ok(VmRequestResult::new(None, false));
3226 }
3227 _ => {
3228 if !state.cfg.force_s2idle {
3229 #[cfg(feature = "pvclock")]
3230 if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
3231 // Update clock offset when pvclock is used.
3232 if let VmRequest::ResumeVcpus = request {
3233 let cmd = PvClockCommand::Resume;
3234 match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
3235 Ok(action) => {
3236 info!("{:?} command successfully processed", cmd);
3237 if let Some(action) = action {
3238 match action {
3239 #[cfg(target_arch = "aarch64")]
3240 PvClockAction::SetCounterOffset(offset) => {
3241 state.linux.vm.set_counter_offset(offset)?;
3242 }
3243 }
3244 }
3245 }
3246 Err(e) => error!("{:?} command failed: {:#}", cmd, e),
3247 };
3248 }
3249 }
3250 }
3251 let kick_all_vcpus = |msg| {
3252 if let VcpuControl::RunState(VmRunMode::Running) = msg {
3253 for dev in &state.linux.resume_notify_devices {
3254 dev.lock().resume_imminent();
3255 }
3256 }
3257 vcpu::kick_all_vcpus(state.vcpu_handles, state.linux.irq_chip.as_irq_chip(), msg);
3258 };
3259 let response = request.execute(
3260 &state.linux.vm,
3261 state.disk_host_tubes,
3262 #[cfg(feature = "audio")]
3263 state.snd_host_tubes,
3264 #[cfg(not(feature = "audio"))]
3265 &[],
3266 &mut state.linux.pm,
3267 #[cfg(feature = "gpu")]
3268 state.gpu_control_tube,
3269 #[cfg(not(feature = "gpu"))]
3270 None,
3271 #[cfg(feature = "usb")]
3272 Some(state.usb_control_tube),
3273 #[cfg(not(feature = "usb"))]
3274 None,
3275 &mut state.linux.bat_control,
3276 kick_all_vcpus,
3277 |index, msg| {
3278 vcpu::kick_vcpu(
3279 &state.vcpu_handles.get(index),
3280 state.linux.irq_chip.as_irq_chip(),
3281 msg,
3282 )
3283 },
3284 state.cfg.force_s2idle,
3285 #[cfg(feature = "swap")]
3286 state.swap_controller.as_ref(),
3287 state.device_ctrl_tube,
3288 state.vcpu_handles.len(),
3289 state.irq_handler_control,
3290 || state.linux.irq_chip.snapshot(state.linux.vcpu_count),
3291 state.suspended_pvclock_state,
3292 );
3293 if state.cfg.force_s2idle {
3294 if let VmRequest::SuspendVcpus = request {
3295 // Spawn s2idle wait thread.
3296 let send_tube = tube.try_clone_send_tube().unwrap();
3297 let suspend_tube = state.linux.suspend_tube.0.clone();
3298 let guest_suspended_cvar = state.guest_suspended_cvar.clone();
3299 let pm = state.linux.pm.clone();
3300
3301 std::thread::Builder::new()
3302 .name("s2idle_wait".to_owned())
3303 .spawn(move || {
3304 trigger_vm_suspend_and_wait_for_entry(
3305 guest_suspended_cvar.unwrap(),
3306 &send_tube,
3307 response,
3308 suspend_tube,
3309 pm,
3310 )
3311 })
3312 .context("failed to spawn s2idle_wait thread")?;
3313
3314 // For s2idle, omit the response since it will be sent by
3315 // s2idle_wait thread when suspension actually happens.
3316 return Ok(VmRequestResult::new(None, false));
3317 }
3318 } else {
3319 #[cfg(feature = "pvclock")]
3320 if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
3321 // Record the time after VCPUs are suspended to track suspension duration.
3322 if let VmRequest::SuspendVcpus = request {
3323 let cmd = PvClockCommand::Suspend;
3324 match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
3325 Ok(action) => {
3326 info!("{:?} command successfully processed", cmd);
3327 if let Some(action) = action {
3328 error!("Unexpected action {:?} requested for suspend", action);
3329 }
3330 }
3331 Err(e) => error!("{:?} command failed: {:#}", cmd, e),
3332 };
3333 }
3334 }
3335 }
3336 response
3337 }
3338 };
3339
3340 cfg_if::cfg_if! {
3341 if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
3342 if !add_irq_control_tubes.is_empty() {
3343 state
3344 .irq_handler_control
3345 .send(&IrqHandlerRequest::AddIrqControlTubes(
3346 add_irq_control_tubes,
3347 ))?;
3348 }
3349 if !add_vm_memory_control_tubes.is_empty() {
3350 state
3351 .vm_memory_handler_control
3352 .send(&VmMemoryHandlerRequest::AddControlTubes(
3353 add_vm_memory_control_tubes,
3354 ))?;
3355 }
3356 }
3357 }
3358
3359 Ok(VmRequestResult::new(Some(response), false))
3360 }
3361
process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, socket: &TaggedControlTube, ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)>3362 fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3363 state: &mut ControlLoopState<V, Vcpu>,
3364 id: usize,
3365 socket: &TaggedControlTube,
3366 ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
3367 let mut vm_control_ids_to_remove = Vec::new();
3368 let mut add_tubes = Vec::new();
3369 match socket {
3370 TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3371 Ok(request) => {
3372 let res = process_vm_request(state, id, tube, request, &mut add_tubes)?;
3373
3374 if let Some(response) = res.response {
3375 if let Err(e) = tube.send(&response) {
3376 error!("failed to send VmResponse: {}", e);
3377 }
3378 }
3379
3380 if res.exit {
3381 return Ok((true, Vec::new(), Vec::new()));
3382 }
3383 }
3384 Err(e) => {
3385 if let TubeError::Disconnected = e {
3386 vm_control_ids_to_remove.push(id);
3387 } else {
3388 error!("failed to recv VmRequest: {}", e);
3389 }
3390 }
3391 },
3392 TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMemoryMappingRequest>() {
3393 Ok(request) => {
3394 let response = request.execute(&mut state.linux.vm);
3395 if let Err(e) = tube.send(&response) {
3396 error!("failed to send VmMsyncResponse: {}", e);
3397 }
3398 }
3399 Err(e) => {
3400 if let TubeError::Disconnected = e {
3401 vm_control_ids_to_remove.push(id);
3402 } else {
3403 error!("failed to recv VmMsyncRequest: {}", e);
3404 }
3405 }
3406 },
3407 TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3408 Ok(request) => {
3409 let response =
3410 request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
3411 if let Err(e) = tube.send(&response) {
3412 error!("failed to send VmResponse: {}", e);
3413 }
3414 }
3415 Err(e) => {
3416 if let TubeError::Disconnected = e {
3417 vm_control_ids_to_remove.push(id);
3418 } else {
3419 error!("failed to recv VmResponse: {}", e);
3420 }
3421 }
3422 },
3423 }
3424
3425 Ok((false, vm_control_ids_to_remove, add_tubes))
3426 }
3427
#[cfg(feature = "registered_events")]
/// A `ProtoTube` paired with the socket address it is connected to. Equality
/// and hashing consider only `socket_addr`, so sets of these de-duplicate by
/// address.
struct AddressedProtoTube {
    tube: Rc<ProtoTube>,
    socket_addr: String,
}
3433
#[cfg(feature = "registered_events")]
impl PartialEq for AddressedProtoTube {
    // Tubes compare equal when they target the same listener socket address;
    // the underlying tube handle is ignored.
    fn eq(&self, other: &Self) -> bool {
        self.socket_addr == other.socket_addr
    }
}
3440
#[cfg(feature = "registered_events")]
// `Eq` holds because `String` equality on `socket_addr` is a total equivalence.
impl Eq for AddressedProtoTube {}
3443
#[cfg(feature = "registered_events")]
impl Hash for AddressedProtoTube {
    // Hash only the socket address, keeping `Hash` consistent with the
    // `PartialEq` implementation (required for use in a `HashSet`).
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.socket_addr.hash(state);
    }
}
3450
#[cfg(feature = "registered_events")]
impl AddressedProtoTube {
    /// Sends `msg` as a protobuf over the underlying tube.
    pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
        self.tube.send_proto(msg)
    }
}
3457
#[cfg(feature = "registered_events")]
/// Scans the registration map for tubes connected to `socket_addr`.
///
/// Returns a tube already connected to that address (if any), plus a flag
/// indicating whether `(socket_addr, event)` is already registered.
fn find_registered_tube<'a>(
    registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    socket_addr: &str,
    event: RegisteredEvent,
) -> (Option<&'a Rc<ProtoTube>>, bool) {
    let mut existing_tube: Option<&Rc<ProtoTube>> = None;
    for (registered_event, addr_tubes) in registered_tubes {
        for addr_tube in addr_tubes.iter().filter(|t| t.socket_addr == socket_addr) {
            if *registered_event == event {
                // This exact (addr, event) pair is already registered.
                return (existing_tube, true);
            }
            // All tubes for the same address are Rcs to the same underlying
            // tube, so any match will do — but keep scanning in case the
            // requested event is registered under another entry.
            existing_tube = Some(&addr_tube.tube);
        }
    }
    (existing_tube, false)
}
3484
#[cfg(feature = "registered_events")]
/// Wraps `addr` into an [`AddressedProtoTube`], reusing `tube` when one is
/// already connected to that address, otherwise connecting a fresh
/// `UnixSeqpacket` socket.
///
/// # Errors
/// Fails if connecting to `addr` or constructing the `Tube` fails.
fn make_addr_tube_from_maybe_existing(
    tube: Option<&Rc<ProtoTube>>,
    addr: String,
) -> Result<AddressedProtoTube> {
    if let Some(registered_tube) = tube {
        Ok(AddressedProtoTube {
            tube: registered_tube.clone(),
            socket_addr: addr,
        })
    } else {
        // Borrow `addr` for the connect call (it takes `impl AsRef<Path>`),
        // avoiding the previous redundant clone; `addr` is then moved into the
        // returned struct.
        let sock = UnixSeqpacket::connect(&addr).with_context(|| {
            format!("failed to connect to registered listening socket {}", addr)
        })?;
        let tube = ProtoTube::from(Tube::try_from(sock)?);
        Ok(AddressedProtoTube {
            tube: Rc::new(tube),
            socket_addr: addr,
        })
    }
}
3506
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, cfg: Config, control_server_socket: Option<UnlinkUnixSeqpacketListener>, all_control_tubes: Vec<AnyControlTube>, #[cfg(feature = "usb")] usb_control_tube: Tube, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, sigchld_fd: SignalFd, gralloc: RutabagaGralloc, vcpu_ids: Vec<usize>, iommu_host_tube: Option<Tube>, #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>, #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>, #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>, #[allow(unused_mut)] #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube, guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, metrics_tube: RecvTube, mut vfio_container_manager: VfioContainerManager, mut worker_process_pids: BTreeSet<Pid>, #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap< usize, PathBuf, >, ) -> Result<ExitState>3507 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3508 mut linux: RunnableLinuxVm<V, Vcpu>,
3509 sys_allocator: SystemAllocator,
3510 cfg: Config,
3511 control_server_socket: Option<UnlinkUnixSeqpacketListener>,
3512 all_control_tubes: Vec<AnyControlTube>,
3513 #[cfg(feature = "usb")] usb_control_tube: Tube,
3514 vm_evt_rdtube: RecvTube,
3515 vm_evt_wrtube: SendTube,
3516 sigchld_fd: SignalFd,
3517 gralloc: RutabagaGralloc,
3518 vcpu_ids: Vec<usize>,
3519 iommu_host_tube: Option<Tube>,
3520 #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
3521 #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
3522 #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
3523 #[allow(unused_mut)] // mut is required x86 only
3524 #[cfg(feature = "swap")]
3525 mut swap_controller: Option<SwapController>,
3526 #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
3527 guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
3528 metrics_tube: RecvTube,
3529 mut vfio_container_manager: VfioContainerManager,
3530 // A set of PID of child processes whose clean exit is expected and can be ignored.
3531 mut worker_process_pids: BTreeSet<Pid>,
3532 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap<
3533 usize,
3534 PathBuf,
3535 >,
3536 ) -> Result<ExitState> {
3537 // Split up `all_control_tubes`.
3538 #[cfg(feature = "balloon")]
3539 let mut balloon_host_tube = None;
3540 let mut disk_host_tubes = Vec::new();
3541 #[cfg(feature = "gpu")]
3542 let mut gpu_control_tube = None;
3543 #[cfg(feature = "pvclock")]
3544 let mut pvclock_host_tube = None;
3545 #[cfg(feature = "audio")]
3546 let mut snd_host_tubes = Vec::new();
3547 let mut irq_control_tubes = Vec::new();
3548 let mut vm_memory_control_tubes = Vec::new();
3549 let mut control_tubes = Vec::new();
3550 for t in all_control_tubes {
3551 match t {
3552 #[cfg(feature = "balloon")]
3553 AnyControlTube::DeviceControlTube(DeviceControlTube::Balloon(t)) => {
3554 assert!(balloon_host_tube.is_none());
3555 balloon_host_tube = Some(t)
3556 }
3557 AnyControlTube::DeviceControlTube(DeviceControlTube::Disk(t)) => {
3558 disk_host_tubes.push(t)
3559 }
3560 #[cfg(feature = "gpu")]
3561 AnyControlTube::DeviceControlTube(DeviceControlTube::Gpu(t)) => {
3562 assert!(gpu_control_tube.is_none());
3563 gpu_control_tube = Some(t)
3564 }
3565 #[cfg(feature = "pvclock")]
3566 AnyControlTube::DeviceControlTube(DeviceControlTube::PvClock(t)) => {
3567 assert!(pvclock_host_tube.is_none());
3568 pvclock_host_tube = Some(Arc::new(t))
3569 }
3570 #[cfg(feature = "audio")]
3571 AnyControlTube::DeviceControlTube(DeviceControlTube::Snd(t)) => {
3572 snd_host_tubes.push(t);
3573 }
3574 AnyControlTube::IrqTube(t) => irq_control_tubes.push(t),
3575 AnyControlTube::TaggedControlTube(t) => control_tubes.push(t),
3576 AnyControlTube::VmMemoryTube(t) => vm_memory_control_tubes.push(t),
3577 }
3578 }
3579
3580 #[cfg(feature = "gdb")]
3581 let (to_gdb_channel, gdb) = if let Some(port) = cfg.gdb {
3582 // GDB needs a control socket to interrupt vcpus.
3583 let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
3584 control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
3585 // Create a channel for GDB thread.
3586 let (to_gdb_channel, from_vcpu_channel) = mpsc::channel();
3587 (
3588 Some(to_gdb_channel),
3589 Some((port, gdb_control_tube, from_vcpu_channel)),
3590 )
3591 } else {
3592 (None, None)
3593 };
3594
3595 #[derive(EventToken)]
3596 enum Token {
3597 VmEvent,
3598 Suspend,
3599 ChildSignal,
3600 VmControlServer,
3601 VmControl {
3602 id: usize,
3603 },
3604 #[cfg(feature = "registered_events")]
3605 RegisteredEvent,
3606 #[cfg(feature = "balloon")]
3607 BalloonTube,
3608 }
3609 stdin()
3610 .set_raw_mode()
3611 .expect("failed to set terminal raw mode");
3612
3613 let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
3614 let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
3615
3616 let wait_ctx = WaitContext::build_with(&[
3617 (&linux.suspend_tube.1, Token::Suspend),
3618 (&sigchld_fd, Token::ChildSignal),
3619 (&vm_evt_rdtube, Token::VmEvent),
3620 #[cfg(feature = "registered_events")]
3621 (®_evt_rdtube, Token::RegisteredEvent),
3622 ])
3623 .context("failed to build wait context")?;
3624
3625 if let Some(socket_server) = &control_server_socket {
3626 wait_ctx
3627 .add(socket_server, Token::VmControlServer)
3628 .context("failed to add descriptor to wait context")?;
3629 }
3630 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
3631 let mut next_control_id = control_tubes.len();
3632 for (id, socket) in control_tubes.iter() {
3633 wait_ctx
3634 .add(socket.as_ref(), Token::VmControl { id: *id })
3635 .context("failed to add descriptor to wait context")?;
3636 }
3637
3638 #[cfg(feature = "balloon")]
3639 let mut balloon_tube = balloon_host_tube
3640 .map(|tube| -> Result<BalloonTube> {
3641 wait_ctx
3642 .add(&tube, Token::BalloonTube)
3643 .context("failed to add descriptor to wait context")?;
3644 Ok(BalloonTube::new(tube))
3645 })
3646 .transpose()
3647 .context("failed to create balloon tube")?;
3648
3649 if cfg.jail_config.is_some() {
3650 // Before starting VCPUs, in case we started with some capabilities, drop them all.
3651 drop_capabilities().context("failed to drop process capabilities")?;
3652 }
3653
3654 let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
3655 // Create devices thread, and restore if a restore file exists.
3656 linux.devices_thread = match create_devices_worker_thread(
3657 linux.vm.get_memory().clone(),
3658 linux.io_bus.clone(),
3659 linux.mmio_bus.clone(),
3660 device_ctrl_resp,
3661 ) {
3662 Ok(join_handle) => Some(join_handle),
3663 Err(e) => {
3664 return Err(anyhow!("Failed to start devices thread: {}", e));
3665 }
3666 };
3667
3668 let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
3669 let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
3670
3671 if !linux
3672 .vm
3673 .get_hypervisor()
3674 .check_capability(HypervisorCap::ImmediateExit)
3675 {
3676 return Err(anyhow!(
3677 "missing required hypervisor capability ImmediateExit"
3678 ));
3679 }
3680
3681 vcpu::setup_vcpu_signal_handler()?;
3682
3683 let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
3684 Some(vec) => vec.into_iter().map(Some).collect(),
3685 None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
3686 };
3687 // Enable core scheduling before creating vCPUs so that the cookie will be
3688 // shared by all vCPU threads.
3689 // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
3690 // itself for even better performance. Only vCPUs need the feature.
3691 if cfg.core_scheduling && cfg.per_vm_core_scheduling {
3692 if let Err(e) = enable_core_scheduling() {
3693 error!("Failed to enable core scheduling: {}", e);
3694 }
3695 }
3696
    // The tasks file only exists on sysfs if CgroupV1 hierarchies are enabled.
3698 let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
3699 None => None,
3700 Some(cgroup_path) => {
3701 // Move main process to cgroup_path
3702 match File::create(cgroup_path.join("tasks")) {
3703 Ok(file) => Some(file),
3704 Err(_) => {
3705 info!(
3706 "Unable to open tasks file in cgroup: {}, trying CgroupV2",
3707 cgroup_path.display()
3708 );
3709 None
3710 }
3711 }
3712 }
3713 };
3714
3715 // vCPU freq domains are currently only supported with CgroupsV2.
3716 let mut vcpu_cgroup_v2_files: std::collections::BTreeMap<usize, File> = BTreeMap::new();
3717 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3718 for (vcpu_id, vcpu_domain_path) in vcpu_domain_paths.iter() {
3719 let vcpu_cgroup_v2_file = File::create(vcpu_domain_path.join("cgroup.threads"))
3720 .with_context(|| {
3721 format!(
3722 "failed to create vcpu-cgroup-path {}",
3723 vcpu_domain_path.join("cgroup.threads").display(),
3724 )
3725 })?;
3726 vcpu_cgroup_v2_files.insert(*vcpu_id, vcpu_cgroup_v2_file);
3727 }
3728
3729 #[cfg(target_arch = "x86_64")]
3730 let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
3731 #[cfg(target_arch = "x86_64")]
3732 if cfg.bus_lock_ratelimit > 0 {
3733 let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
3734 if linux.vm.check_capability(VmCap::BusLockDetect) {
3735 info!("Hypervisor support bus lock detect");
3736 linux
3737 .vm
3738 .enable_capability(VmCap::BusLockDetect, 0)
3739 .expect("kvm: Failed to enable bus lock detection cap");
3740 info!("Hypervisor enabled bus lock detect");
3741 bus_lock_ratelimit_ctrl
3742 .lock()
3743 .ratelimit_set_speed(bus_lock_ratelimit);
3744 } else {
3745 bail!("Kvm: bus lock detection unsuported");
3746 }
3747 }
3748
3749 #[cfg(target_os = "android")]
3750 android::set_process_profiles(&cfg.task_profiles)?;
3751
3752 #[allow(unused_mut)]
3753 let mut run_mode = if cfg.suspended {
3754 // Sleep devices before creating vcpus.
3755 device_ctrl_tube
3756 .send(&DeviceControlCommand::SleepDevices)
3757 .context("send command to devices control socket")?;
3758 match device_ctrl_tube
3759 .recv()
3760 .context("receive from devices control socket")?
3761 {
3762 VmResponse::Ok => (),
3763 resp => bail!("device sleep failed: {}", resp),
3764 }
3765 VmRunMode::Suspending
3766 } else {
3767 VmRunMode::Running
3768 };
3769 #[cfg(feature = "gdb")]
3770 if to_gdb_channel.is_some() {
3771 // Wait until a GDB client attaches
3772 run_mode = VmRunMode::Breakpoint;
3773 }
3774 // If we are restoring from a snapshot, then start suspended.
3775 let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
3776 (VmRunMode::Suspending, run_mode)
3777 } else {
3778 (run_mode, run_mode)
3779 };
3780
3781 // Architecture-specific code must supply a vcpu_init element for each VCPU.
3782 assert_eq!(vcpus.len(), linux.vcpu_init.len());
3783
3784 let (vcpu_pid_tid_sender, vcpu_pid_tid_receiver) = mpsc::channel();
3785 for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
3786 {
3787 let vcpu_cgroup_file: Option<File>;
3788 if let Some(cgroup_file) = &vcpu_cgroup_tasks_file {
3789 vcpu_cgroup_file = Some(cgroup_file.try_clone().unwrap())
3790 } else if !cfg.cpu_freq_domains.is_empty() {
3791 vcpu_cgroup_file = Some(
3792 (vcpu_cgroup_v2_files.remove(&cpu_id).unwrap())
3793 .try_clone()
3794 .unwrap(),
3795 )
3796 } else {
3797 vcpu_cgroup_file = None
3798 };
3799
3800 let (to_vcpu_channel, from_main_channel) = mpsc::channel();
3801 let vcpu_affinity = match linux.vcpu_affinity.clone() {
3802 Some(VcpuAffinity::Global(v)) => v,
3803 Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
3804 None => Default::default(),
3805 };
3806
3807 #[cfg(target_arch = "x86_64")]
3808 let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
3809 Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
3810 } else {
3811 None
3812 };
3813
3814 #[cfg(target_arch = "x86_64")]
3815 let cpu_config = Some(CpuConfigX86_64::new(
3816 cfg.force_calibrated_tsc_leaf,
3817 cfg.host_cpu_topology,
3818 cfg.enable_hwp,
3819 cfg.no_smt,
3820 cfg.itmt,
3821 vcpu_hybrid_type,
3822 ));
3823 #[cfg(target_arch = "x86_64")]
3824 let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
3825
3826 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3827 let cpu_config = None;
3828
3829 #[cfg(target_arch = "riscv64")]
3830 let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
3831
3832 let handle = vcpu::run_vcpu(
3833 cpu_id,
3834 vcpu_ids[cpu_id],
3835 vcpu,
3836 vcpu_init,
3837 linux.vm.try_clone().context("failed to clone vm")?,
3838 linux
3839 .irq_chip
3840 .try_box_clone()
3841 .context("failed to clone irqchip")?,
3842 linux.vcpu_count,
3843 linux.rt_cpus.contains(&cpu_id),
3844 vcpu_affinity,
3845 linux.delay_rt,
3846 vcpu_thread_barrier.clone(),
3847 (*linux.io_bus).clone(),
3848 (*linux.mmio_bus).clone(),
3849 vm_evt_wrtube
3850 .try_clone()
3851 .context("failed to clone vm event tube")?,
3852 from_main_channel,
3853 #[cfg(feature = "gdb")]
3854 to_gdb_channel.clone(),
3855 cfg.core_scheduling,
3856 cfg.per_vm_core_scheduling,
3857 cpu_config,
3858 match vcpu_cgroup_file {
3859 None => None,
3860 Some(ref f) => Some(
3861 f.try_clone()
3862 .context("failed to clone vcpu cgroup tasks file")?,
3863 ),
3864 },
3865 #[cfg(target_arch = "x86_64")]
3866 bus_lock_ratelimit_ctrl,
3867 run_mode,
3868 cfg.boost_uclamp,
3869 vcpu_pid_tid_sender.clone(),
3870 )?;
3871 vcpu_handles.push((handle, to_vcpu_channel));
3872 }
3873
3874 let mut vcpus_pid_tid = BTreeMap::new();
3875 for _ in 0..vcpu_handles.len() {
3876 let vcpu_pid_tid: VcpuPidTid = vcpu_pid_tid_receiver
3877 .recv()
3878 .context("failed receiving vcpu pid/tid")?;
3879 if vcpus_pid_tid
3880 .insert(
3881 vcpu_pid_tid.vcpu_id,
3882 (vcpu_pid_tid.process_id, vcpu_pid_tid.thread_id),
3883 )
3884 .is_some()
3885 {
3886 return Err(anyhow!(
3887 "Vcpu {} returned more than 1 PID and TID",
3888 vcpu_pid_tid.vcpu_id
3889 ));
3890 }
3891 }
3892
3893 #[cfg(feature = "gdb")]
3894 // Spawn GDB thread.
3895 if let Some((gdb_port_num, gdb_control_tube, from_vcpu_channel)) = gdb {
3896 let to_vcpu_channels = vcpu_handles
3897 .iter()
3898 .map(|(_handle, channel)| channel.clone())
3899 .collect();
3900 let target = GdbStub::new(gdb_control_tube, to_vcpu_channels, from_vcpu_channel);
3901 std::thread::Builder::new()
3902 .name("gdb".to_owned())
3903 .spawn(move || gdb_thread(target, gdb_port_num))
3904 .context("failed to spawn GDB thread")?;
3905 };
3906
3907 let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
3908 let sys_allocator_for_thread = sys_allocator_mutex.clone();
3909 let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
3910 let irq_handler_thread = std::thread::Builder::new()
3911 .name("irq_handler_thread".into())
3912 .spawn(move || {
3913 irq_handler_thread(
3914 irq_control_tubes,
3915 irq_chip_for_thread,
3916 sys_allocator_for_thread,
3917 irq_handler_control_for_thread,
3918 )
3919 })
3920 .unwrap();
3921
3922 let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
3923 let vm_memory_handler_thread = std::thread::Builder::new()
3924 .name("vm_memory_handler_thread".into())
3925 .spawn({
3926 let vm = linux.vm.try_clone().context("failed to clone Vm")?;
3927 let sys_allocator_mutex = sys_allocator_mutex.clone();
3928 let iommu_client = iommu_host_tube
3929 .as_ref()
3930 .map(|t| VmMemoryRequestIommuClient::new(t.clone()));
3931 move || {
3932 vm_memory_handler_thread(
3933 vm_memory_control_tubes,
3934 vm,
3935 sys_allocator_mutex,
3936 gralloc,
3937 iommu_client,
3938 vm_memory_handler_control_for_thread,
3939 )
3940 }
3941 })
3942 .unwrap();
3943
3944 vcpu_thread_barrier.wait();
3945
3946 // See comment on `VmRequest::execute`.
3947 let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
3948
3949 // Restore VM (if applicable).
3950 // Must happen after the vCPU barrier to avoid deadlock.
3951 if let Some(path) = &cfg.restore_path {
3952 vm_control::do_restore(
3953 path,
3954 |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
3955 |msg, index| {
3956 vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
3957 },
3958 &irq_handler_control,
3959 &device_ctrl_tube,
3960 linux.vcpu_count,
3961 |image| {
3962 linux
3963 .irq_chip
3964 .try_box_clone()?
3965 .restore(image, linux.vcpu_count)
3966 },
3967 /* require_encrypted= */ false,
3968 &mut suspended_pvclock_state,
3969 &linux.vm,
3970 )?;
3971 // Allow the vCPUs to start for real.
3972 vcpu::kick_all_vcpus(
3973 &vcpu_handles,
3974 linux.irq_chip.as_irq_chip(),
3975 VcpuControl::RunState(post_restore_run_mode),
3976 )
3977 }
3978
3979 #[cfg(feature = "swap")]
3980 if let Some(swap_controller) = &swap_controller {
3981 swap_controller
3982 .on_static_devices_setup_complete()
3983 .context("static device setup complete")?;
3984 }
3985
3986 let metrics_thread = if metrics::is_initialized() {
3987 Some(
3988 std::thread::Builder::new()
3989 .name("metrics_thread".into())
3990 .spawn(move || {
3991 if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
3992 error!("Metrics controller error: {:?}", e);
3993 }
3994 })
3995 .context("metrics thread failed")?,
3996 )
3997 } else {
3998 None
3999 };
4000
4001 let mut exit_state = ExitState::Stop;
4002 let mut pvpanic_code = PvPanicCode::Unknown;
4003 #[cfg(feature = "registered_events")]
4004 let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
4005 HashMap::new();
4006
4007 'wait: loop {
4008 let events = {
4009 match wait_ctx.wait() {
4010 Ok(v) => v,
4011 Err(e) => {
4012 error!("failed to poll: {}", e);
4013 break;
4014 }
4015 }
4016 };
4017
4018 let mut vm_control_ids_to_remove = Vec::new();
4019 for event in events.iter().filter(|e| e.is_readable) {
4020 match event.token {
4021 #[cfg(feature = "registered_events")]
4022 Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
4023 Ok(reg_evt) => {
4024 let evt = reg_evt.into_event();
4025 let mut tubes_to_remove: Vec<String> = Vec::new();
4026 if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
4027 for tube in tubes.iter() {
4028 if let Err(e) = tube.send(®_evt.into_proto()) {
4029 warn!(
4030 "failed to send registered event {:?} to {}, removing from \
4031 registrations: {}",
4032 reg_evt, tube.socket_addr, e
4033 );
4034 tubes_to_remove.push(tube.socket_addr.clone());
4035 }
4036 }
4037 }
4038 for tube_addr in tubes_to_remove {
4039 for tubes in registered_evt_tubes.values_mut() {
4040 tubes.retain(|t| t.socket_addr != tube_addr);
4041 }
4042 }
4043 registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
4044 }
4045 Err(e) => {
4046 warn!("failed to recv RegisteredEvent: {}", e);
4047 }
4048 },
4049 Token::VmEvent => {
4050 let mut break_to_wait: bool = true;
4051 match vm_evt_rdtube.recv::<VmEventType>() {
4052 Ok(vm_event) => match vm_event {
4053 VmEventType::Exit => {
4054 info!("vcpu requested shutdown");
4055 exit_state = ExitState::Stop;
4056 }
4057 VmEventType::Reset => {
4058 info!("vcpu requested reset");
4059 exit_state = ExitState::Reset;
4060 }
4061 VmEventType::Crash => {
4062 info!("vcpu crashed");
4063 exit_state = ExitState::Crash;
4064 }
4065 VmEventType::Panic(panic_code) => {
4066 pvpanic_code = PvPanicCode::from_u8(panic_code);
4067 info!("Guest reported panic [Code: {}]", pvpanic_code);
4068 break_to_wait = false;
4069 }
4070 VmEventType::WatchdogReset => {
4071 info!("vcpu stall detected");
4072 exit_state = ExitState::WatchdogReset;
4073 }
4074 },
4075 Err(e) => {
4076 warn!("failed to recv VmEvent: {}", e);
4077 }
4078 }
4079 if break_to_wait {
4080 if pvpanic_code == PvPanicCode::Panicked {
4081 exit_state = ExitState::GuestPanic;
4082 }
4083 break 'wait;
4084 }
4085 }
4086 Token::Suspend => match linux.suspend_tube.1.recv::<bool>() {
4087 Ok(is_suspend_request) => {
4088 let mode = if is_suspend_request {
4089 VmRunMode::Suspending
4090 } else {
4091 for dev in &linux.resume_notify_devices {
4092 dev.lock().resume_imminent();
4093 }
4094 VmRunMode::Running
4095 };
4096 info!("VM requested {}", mode);
4097 vcpu::kick_all_vcpus(
4098 &vcpu_handles,
4099 linux.irq_chip.as_irq_chip(),
4100 VcpuControl::RunState(mode),
4101 );
4102 }
4103 Err(err) => {
4104 warn!("Failed to read suspend tube {:?}", err);
4105 }
4106 },
4107 Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop if a child
                    // process has exited, except for CLD_STOPPED and CLD_CONTINUED. Those two
                    // should be ignored here since they are used by the vmm-swap feature.
4111 let mut do_exit = false;
4112 while let Some(siginfo) =
4113 sigchld_fd.read().context("failed to read signalfd")?
4114 {
4115 let pid = siginfo.ssi_pid;
4116 let pid_label = match linux.pid_debug_label_map.get(&pid) {
4117 Some(label) => format!("{} (pid {})", label, pid),
4118 None => format!("pid {}", pid),
4119 };
4120
4121 // TODO(kawasin): this is a temporary exception until device suspension.
4122 #[cfg(feature = "swap")]
4123 if siginfo.ssi_code == libc::CLD_STOPPED
4124 || siginfo.ssi_code == libc::CLD_CONTINUED
4125 {
4126 continue;
4127 }
4128
4129 // Ignore clean exits of non-tracked child processes when running without
4130 // sandboxing. The virtio gpu process launches a render server for
4131 // pass-through graphics. Host GPU drivers have been observed to fork
4132 // child processes that exit cleanly which should not be considered a
4133 // crash. When running with sandboxing, this should be handled by the
4134 // device's process handler.
4135 if cfg.jail_config.is_none()
4136 && !linux.pid_debug_label_map.contains_key(&pid)
4137 && siginfo.ssi_signo == libc::SIGCHLD as u32
4138 && siginfo.ssi_code == libc::CLD_EXITED
4139 && siginfo.ssi_status == 0
4140 {
4141 continue;
4142 }
4143
4144 // Allow clean exits of a child process in `worker_process_pids`.
4145 if siginfo.ssi_signo == libc::SIGCHLD as u32
4146 && siginfo.ssi_code == libc::CLD_EXITED
4147 && siginfo.ssi_status == 0
4148 && worker_process_pids.remove(&(pid as Pid))
4149 {
4150 info!("child {pid} exited successfully");
4151 continue;
4152 }
4153
4154 error!(
4155 "child {} exited: signo {}, status {}, code {}",
4156 pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
4157 );
4158 do_exit = true;
4159 }
4160 if do_exit {
4161 exit_state = ExitState::Crash;
4162 break 'wait;
4163 }
4164 }
4165 Token::VmControlServer => {
4166 if let Some(socket_server) = &control_server_socket {
4167 match socket_server.accept() {
4168 Ok(socket) => {
4169 let id = next_control_id;
4170 next_control_id += 1;
4171 wait_ctx
4172 .add(&socket, Token::VmControl { id })
4173 .context("failed to add descriptor to wait context")?;
4174 control_tubes
4175 .insert(id, TaggedControlTube::Vm(Tube::try_from(socket)?));
4176 }
4177 Err(e) => error!("failed to accept socket: {}", e),
4178 }
4179 }
4180 }
4181 Token::VmControl { id } => {
4182 if let Some(socket) = control_tubes.get(&id) {
4183 let mut state = ControlLoopState {
4184 linux: &mut linux,
4185 cfg: &cfg,
4186 sys_allocator: &sys_allocator_mutex,
4187 control_tubes: &control_tubes,
4188 disk_host_tubes: &disk_host_tubes[..],
4189 #[cfg(feature = "audio")]
4190 snd_host_tubes: &snd_host_tubes[..],
4191 #[cfg(feature = "gpu")]
4192 gpu_control_tube: gpu_control_tube.as_ref(),
4193 #[cfg(feature = "usb")]
4194 usb_control_tube: &usb_control_tube,
4195 #[cfg(target_arch = "x86_64")]
4196 iommu_host_tube: &iommu_host_tube,
4197 #[cfg(target_arch = "x86_64")]
4198 hp_control_tube: &hp_control_tube,
4199 guest_suspended_cvar: &guest_suspended_cvar,
4200 #[cfg(feature = "pci-hotplug")]
4201 hotplug_manager: &mut hotplug_manager,
4202 #[cfg(feature = "swap")]
4203 swap_controller: &mut swap_controller,
4204 vcpu_handles: &vcpu_handles,
4205 #[cfg(feature = "balloon")]
4206 balloon_tube: balloon_tube.as_mut(),
4207 device_ctrl_tube: &device_ctrl_tube,
4208 irq_handler_control: &irq_handler_control,
4209 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
4210 vm_memory_handler_control: &vm_memory_handler_control,
4211 #[cfg(feature = "registered_events")]
4212 registered_evt_tubes: &mut registered_evt_tubes,
4213 #[cfg(feature = "pvclock")]
4214 pvclock_host_tube: pvclock_host_tube.clone(),
4215 vfio_container_manager: &mut vfio_container_manager,
4216 suspended_pvclock_state: &mut suspended_pvclock_state,
4217 vcpus_pid_tid: &vcpus_pid_tid,
4218 };
4219 let (exit_requested, mut ids_to_remove, add_tubes) =
4220 process_vm_control_event(&mut state, id, socket)?;
4221 if exit_requested {
4222 break 'wait;
4223 }
4224 vm_control_ids_to_remove.append(&mut ids_to_remove);
4225 for socket in add_tubes {
4226 let id = next_control_id;
4227 next_control_id += 1;
4228 wait_ctx
4229 .add(socket.as_ref(), Token::VmControl { id })
4230 .context(
4231 "failed to add hotplug vfio-pci descriptor to wait context",
4232 )?;
4233 control_tubes.insert(id, socket);
4234 }
4235 }
4236 }
4237 #[cfg(feature = "balloon")]
4238 Token::BalloonTube => {
4239 match balloon_tube.as_mut().expect("missing balloon tube").recv() {
4240 Ok(resp) => {
4241 for (resp, idx) in resp {
4242 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
4243 if let Err(e) = tube.send(&resp) {
4244 error!("failed to send VmResponse: {}", e);
4245 }
4246 } else {
4247 error!("Bad tube index {}", idx);
4248 }
4249 }
4250 }
4251 Err(err) => {
4252 error!("Error processing balloon tube {:?}", err)
4253 }
4254 }
4255 }
4256 }
4257 }
4258
4259 remove_hungup_and_drained_tubes(
4260 &events,
4261 &wait_ctx,
4262 &mut control_tubes,
4263 vm_control_ids_to_remove,
4264 |token: &Token| {
4265 if let Token::VmControl { id } = token {
4266 return Some(*id);
4267 }
4268 None
4269 },
4270 )?;
4271 }
4272
4273 vcpu::kick_all_vcpus(
4274 &vcpu_handles,
4275 linux.irq_chip.as_irq_chip(),
4276 VcpuControl::RunState(VmRunMode::Exiting),
4277 );
4278 for (handle, _) in vcpu_handles {
4279 if let Err(e) = handle.join() {
4280 error!("failed to join vcpu thread: {:?}", e);
4281 }
4282 }
4283
4284 // After joining all vcpu threads, unregister the process-wide signal handler.
4285 if let Err(e) = vcpu::remove_vcpu_signal_handler() {
4286 error!("failed to remove vcpu thread signal handler: {:#}", e);
4287 }
4288
4289 // Stop the vmm-swap monitor process.
4290 #[cfg(feature = "swap")]
4291 drop(swap_controller);
4292
4293 // Stop pci root worker thread
4294 #[cfg(target_arch = "x86_64")]
4295 {
4296 let _ = hp_control_tube.send(PciRootCommand::Kill);
4297 if let Err(e) = hp_thread.join() {
4298 error!("failed to join hotplug thread: {:?}", e);
4299 }
4300 }
4301
4302 if linux.devices_thread.is_some() {
4303 if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
4304 error!("failed to stop device control loop: {}", e);
4305 };
4306 if let Some(thread) = linux.devices_thread.take() {
4307 if let Err(e) = thread.join() {
4308 error!("failed to exit devices thread: {:?}", e);
4309 }
4310 }
4311 }
4312
4313 // Shut down the VM Memory handler thread.
4314 if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
4315 error!(
4316 "failed to request exit from VM Memory handler thread: {}",
4317 e
4318 );
4319 }
4320 if let Err(e) = vm_memory_handler_thread.join() {
4321 error!("failed to exit VM Memory handler thread: {:?}", e);
4322 }
4323
4324 // Shut down the IRQ handler thread.
4325 if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
4326 error!("failed to request exit from IRQ handler thread: {}", e);
4327 }
4328 if let Err(e) = irq_handler_thread.join() {
4329 error!("failed to exit irq handler thread: {:?}", e);
4330 }
4331
4332 // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
4333 // inside `linux`. If the checks below fail, then some other thread is probably still running
4334 // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
4335 // cleaned up.
4336 match Arc::try_unwrap(std::mem::replace(
4337 &mut linux.mmio_bus,
4338 Arc::new(Bus::new(BusType::Mmio)),
4339 )) {
4340 Ok(_) => {}
4341 Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
4342 }
4343 match Arc::try_unwrap(std::mem::replace(
4344 &mut linux.io_bus,
4345 Arc::new(Bus::new(BusType::Io)),
4346 )) {
4347 Ok(_) => {}
4348 Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
4349 }
4350
4351 // Explicitly drop the VM structure here to allow the devices to clean up before the
4352 // control sockets are closed when this function exits.
4353 mem::drop(linux);
4354
4355 // Drop the hotplug manager to tell the warden process to exit before we try to join
4356 // the metrics thread.
4357 #[cfg(feature = "pci-hotplug")]
4358 mem::drop(hotplug_manager);
4359
4360 // All our children should have exited by now, so closing our fd should
4361 // terminate metrics. Then join so that everything gets flushed.
4362 metrics::get_destructor().cleanup();
4363 if let Some(metrics_thread) = metrics_thread {
4364 if let Err(e) = metrics_thread.join() {
4365 error!("failed to exit irq handler thread: {:?}", e);
4366 }
4367 }
4368
4369 stdin()
4370 .set_canon_mode()
4371 .expect("failed to restore canonical mode for terminal");
4372
4373 Ok(exit_state)
4374 }
4375
/// Tokens identifying which descriptor in `irq_handler_thread`'s wait context
/// became ready.
#[derive(EventToken)]
enum IrqHandlerToken {
    /// An IRQ event registered with the irq chip fired; `index` identifies the
    /// event within the chip's event-token list.
    IrqFd { index: IrqEventIndex },
    /// A device IRQ control tube (keyed by `id`) has a pending request.
    VmIrq { id: usize },
    /// The irq chip's delayed-IRQ trigger fired and delayed events need servicing.
    DelayedIrqFd,
    /// The main thread sent an `IrqHandlerRequest` on the handler control tube.
    HandlerControl,
}
4383
/// Handles IRQs and requests from devices to add additional IRQ lines.
///
/// Runs as a dedicated thread. It services IRQ event descriptors through
/// `irq_chip`, handles `VmIrqRequest`s arriving on per-device control tubes,
/// and processes `IrqHandlerRequest` commands (exit, add tubes, refresh event
/// tokens, wake-and-notify) sent by the main thread over `handler_control`.
/// Returns when asked to exit, when the control tube hangs up, or on an
/// unrecoverable wait-context error.
fn irq_handler_thread(
    irq_control_tubes: Vec<Tube>,
    mut irq_chip: Box<dyn IrqChipArch + 'static>,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    let wait_ctx = WaitContext::build_with(&[(
        handler_control.get_read_notifier(),
        IrqHandlerToken::HandlerControl,
    )])
    .context("failed to build wait context")?;

    // Not all irq chips delay IRQ delivery; only register the trigger when one
    // is provided.
    if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
        wait_ctx
            .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
            .context("failed to add descriptor to wait context")?;
    }

    let mut irq_event_tokens = irq_chip
        .irq_event_tokens()
        .context("failed get event tokens from irqchip")?;

    for (index, _gsi, evt) in irq_event_tokens.iter() {
        wait_ctx
            .add(evt, IrqHandlerToken::IrqFd { index: *index })
            .context("failed to add irq chip event tokens to wait context")?;
    }

    // Key each control tube by a monotonically increasing id so a `VmIrq`
    // token can be mapped back to its tube; new tubes added at runtime get
    // fresh ids from `next_control_id`.
    let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
    let mut next_control_id = irq_control_tubes.len();
    for (id, socket) in irq_control_tubes.iter() {
        wait_ctx
            .add(
                socket.get_read_notifier(),
                IrqHandlerToken::VmIrq { id: *id },
            )
            .context("irq control tubes to wait context")?;
    }

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break 'wait;
                }
            }
        };
        let token_count = events.len();
        let mut vm_irq_tubes_to_remove = Vec::new();
        let mut notify_control_on_iteration_end = false;

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                IrqHandlerToken::HandlerControl => {
                    match handler_control.recv::<IrqHandlerRequest>() {
                        Ok(request) => {
                            match request {
                                IrqHandlerRequest::Exit => break 'wait,
                                IrqHandlerRequest::AddIrqControlTubes(tubes) => {
                                    for socket in tubes {
                                        let id = next_control_id;
                                        next_control_id += 1;
                                        wait_ctx
                                            .add(
                                                socket.get_read_notifier(),
                                                IrqHandlerToken::VmIrq { id },
                                            )
                                            .context("failed to add new IRQ control Tube to wait context")?;
                                        irq_control_tubes.insert(id, socket);
                                    }
                                }
                                IrqHandlerRequest::RefreshIrqEventTokens => {
                                    // Drop every currently registered irq event
                                    // from the wait context, re-query the chip,
                                    // and re-register the fresh set before
                                    // acknowledging completion.
                                    for (_index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx.delete(evt).context(
                                            "failed to remove irq chip event \
                                            token from wait context",
                                        )?;
                                    }

                                    irq_event_tokens = irq_chip
                                        .irq_event_tokens()
                                        .context("failed get event tokens from irqchip")?;
                                    for (index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx
                                            .add(evt, IrqHandlerToken::IrqFd { index: *index })
                                            .context(
                                                "failed to add irq chip event \
                                                tokens to wait context",
                                            )?;
                                    }

                                    if let Err(e) = handler_control
                                        .send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
                                    {
                                        error!(
                                            "failed to notify IRQ event token refresh \
                                            was completed: {}",
                                            e
                                        );
                                    }
                                }
                                IrqHandlerRequest::WakeAndNotifyIteration => {
                                    // Defer the acknowledgement until after all
                                    // of this iteration's events are handled.
                                    notify_control_on_iteration_end = true;
                                }
                            }
                        }
                        Err(e) => {
                            if let TubeError::Disconnected = e {
                                // The main thread owns the other end; losing it
                                // without an Exit request is unrecoverable.
                                panic!("irq handler control tube disconnected.");
                            } else {
                                error!("failed to recv IrqHandlerRequest: {}", e);
                            }
                        }
                    }
                }
                IrqHandlerToken::VmIrq { id } => {
                    if let Some(tube) = irq_control_tubes.get(&id) {
                        handle_irq_tube_request(
                            &sys_allocator_mutex,
                            &mut irq_chip,
                            &mut vm_irq_tubes_to_remove,
                            &wait_ctx,
                            tube,
                            id,
                        );
                    }
                }
                IrqHandlerToken::IrqFd { index } => {
                    if let Err(e) = irq_chip.service_irq_event(index) {
                        error!("failed to signal irq {}: {}", index, e);
                    }
                }
                IrqHandlerToken::DelayedIrqFd => {
                    if let Err(e) = irq_chip.process_delayed_irq_events() {
                        warn!("can't deliver delayed irqs: {}", e);
                    }
                }
            }
        }

        if notify_control_on_iteration_end {
            // Reports token_count - 1, presumably excluding the control-tube
            // wakeup itself from the count — TODO(review): confirm against the
            // receiver of HandlerIterationComplete.
            if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
                token_count - 1,
            )) {
                error!(
                    "failed to notify on iteration completion (snapshotting may fail): {}",
                    e
                );
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut irq_control_tubes,
            vm_irq_tubes_to_remove,
            |token: &IrqHandlerToken| {
                if let IrqHandlerToken::VmIrq { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hungup (and fully drained) control tube with no prior Exit request
        // means the main thread went away unexpectedly; stop the loop.
        if events.iter().any(|e| {
            e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
        }) {
            error!("IRQ handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4559
handle_irq_tube_request( sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>, irq_chip: &mut Box<dyn IrqChipArch + 'static>, vm_irq_tubes_to_remove: &mut Vec<usize>, wait_ctx: &WaitContext<IrqHandlerToken>, tube: &Tube, tube_index: usize, )4560 fn handle_irq_tube_request(
4561 sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
4562 irq_chip: &mut Box<dyn IrqChipArch + 'static>,
4563 vm_irq_tubes_to_remove: &mut Vec<usize>,
4564 wait_ctx: &WaitContext<IrqHandlerToken>,
4565 tube: &Tube,
4566 tube_index: usize,
4567 ) {
4568 match tube.recv::<VmIrqRequest>() {
4569 Ok(request) => {
4570 let response = {
4571 request.execute(
4572 |setup| match setup {
4573 IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
4574 let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4575 let source = IrqEventSource {
4576 device_id: device_id.try_into().expect("Invalid device_id"),
4577 queue_id,
4578 device_name,
4579 };
4580 if let Some(event_index) =
4581 irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
4582 {
4583 if let Err(e) =
4584 wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
4585 {
4586 warn!("failed to add IrqFd to poll context: {}", e);
4587 return Err(e);
4588 }
4589 }
4590 Ok(())
4591 }
4592 IrqSetup::Route(route) => irq_chip.route_irq(route),
4593 IrqSetup::UnRegister(irq, ev) => {
4594 let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4595 irq_chip.unregister_edge_irq_event(irq, &irq_evt)
4596 }
4597 },
4598 &mut sys_allocator_mutex.lock(),
4599 )
4600 };
4601 if let Err(e) = tube.send(&response) {
4602 error!("failed to send VmIrqResponse: {}", e);
4603 }
4604 }
4605 Err(e) => {
4606 if let TubeError::Disconnected = e {
4607 vm_irq_tubes_to_remove.push(tube_index);
4608 } else {
4609 error!("failed to recv VmIrqRequest: {}", e);
4610 }
4611 }
4612 }
4613 }
4614
/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// Register additional `VmMemoryTube`s with the handler's wait context.
    /// No response is sent for this command.
    AddControlTubes(Vec<VmMemoryTube>),
    /// Ask the handler thread to break out of its event loop and return.
    /// No response is sent for this command.
    Exit,
}
4623
/// Services `VmMemoryRequest`s from devices on behalf of the VM.
///
/// Runs as a dedicated thread. Each device control tube is keyed by an id in
/// `control_tubes`; requests received on those tubes are executed against the
/// VM, the system allocator, `gralloc`, and (for tubes flagged
/// `expose_with_viommu`) the IOMMU client. `handler_control` carries
/// `VmMemoryHandlerRequest`s from the main thread (add tubes / exit).
fn vm_memory_handler_thread(
    control_tubes: Vec<VmMemoryTube>,
    mut vm: impl Vm,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    mut gralloc: RutabagaGralloc,
    mut iommu_client: Option<VmMemoryRequestIommuClient>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    // Wait-context tokens local to this thread.
    #[derive(EventToken)]
    enum Token {
        VmControl { id: usize },
        HandlerControl,
    }

    let wait_ctx =
        WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
            .context("failed to build wait context")?;
    // Key each control tube by a monotonically increasing id; tubes added at
    // runtime get fresh ids from `next_control_id`.
    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    let mut next_control_id = control_tubes.len();
    for (id, socket) in control_tubes.iter() {
        wait_ctx
            .add(socket.as_ref(), Token::VmControl { id: *id })
            .context("failed to add descriptor to wait context")?;
    }

    // Shared bookkeeping for memory regions registered via these requests.
    let mut region_state: VmMemoryRegionState = Default::default();

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break;
                }
            }
        };

        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
                    Ok(request) => match request {
                        VmMemoryHandlerRequest::Exit => break 'wait,
                        VmMemoryHandlerRequest::AddControlTubes(tubes) => {
                            for socket in tubes {
                                let id = next_control_id;
                                next_control_id += 1;
                                wait_ctx
                                    .add(socket.get_read_notifier(), Token::VmControl { id })
                                    .context(
                                        "failed to add new vm memory control Tube to wait context",
                                    )?;
                                control_tubes.insert(id, socket);
                            }
                        }
                    },
                    Err(e) => {
                        if let TubeError::Disconnected = e {
                            // The main thread owns the other end; losing it
                            // without an Exit request is unrecoverable.
                            panic!("vm memory control tube disconnected.");
                        } else {
                            error!("failed to recv VmMemoryHandlerRequest: {}", e);
                        }
                    }
                },
                Token::VmControl { id } => {
                    if let Some(VmMemoryTube {
                        tube,
                        expose_with_viommu,
                    }) = control_tubes.get(&id)
                    {
                        match tube.recv::<VmMemoryRequest>() {
                            Ok(request) => {
                                let response = request.execute(
                                    tube,
                                    &mut vm,
                                    &mut sys_allocator_mutex.lock(),
                                    &mut gralloc,
                                    // Only viommu-exposed tubes get the IOMMU
                                    // client.
                                    if *expose_with_viommu {
                                        iommu_client.as_mut()
                                    } else {
                                        None
                                    },
                                    &mut region_state,
                                );
                                if let Err(e) = tube.send(&response) {
                                    error!("failed to send VmMemoryControlResponse: {}", e);
                                }
                            }
                            Err(e) => {
                                if let TubeError::Disconnected = e {
                                    // Device went away; queue the tube for
                                    // removal after this event batch.
                                    vm_control_ids_to_remove.push(id);
                                } else {
                                    error!("failed to recv VmMemoryControlRequest: {}", e);
                                }
                            }
                        }
                    }
                }
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut control_tubes,
            vm_control_ids_to_remove,
            |token: &Token| {
                if let Token::VmControl { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hungup (and fully drained) control tube with no prior Exit request
        // means the main thread went away unexpectedly; stop the loop.
        if events
            .iter()
            .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
        {
            error!("vm memory handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4748
4749 /// When control tubes hang up, we want to make sure that we've fully drained
4750 /// the underlying socket before removing it. This function also handles
4751 /// removing closed sockets in such a way that avoids phantom events.
4752 ///
4753 /// `tube_ids_to_remove` is the set of ids that we already know should
4754 /// be removed (e.g. from getting a disconnect error on read).
remove_hungup_and_drained_tubes<T, U>( events: &SmallVec<[TriggeredEvent<T>; 16]>, wait_ctx: &WaitContext<T>, tubes: &mut BTreeMap<usize, U>, mut tube_ids_to_remove: Vec<usize>, get_tube_id: fn(token: &T) -> Option<usize>, ) -> anyhow::Result<()> where T: EventToken, U: ReadNotifier,4755 fn remove_hungup_and_drained_tubes<T, U>(
4756 events: &SmallVec<[TriggeredEvent<T>; 16]>,
4757 wait_ctx: &WaitContext<T>,
4758 tubes: &mut BTreeMap<usize, U>,
4759 mut tube_ids_to_remove: Vec<usize>,
4760 get_tube_id: fn(token: &T) -> Option<usize>,
4761 ) -> anyhow::Result<()>
4762 where
4763 T: EventToken,
4764 U: ReadNotifier,
4765 {
4766 // It's possible more data is readable and buffered while the socket is hungup,
4767 // so don't delete the tube from the poll context until we're sure all the
4768 // data is read.
4769 // Below case covers a condition where we have received a hungup event and the tube is not
4770 // readable.
4771 // In case of readable tube, once all data is read, any attempt to read more data on hungup
4772 // tube should fail. On such failure, we get Disconnected error and ids gets added to
4773 // tube_ids_to_remove by the time we reach here.
4774 for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
4775 if let Some(id) = get_tube_id(&event.token) {
4776 tube_ids_to_remove.push(id);
4777 }
4778 }
4779
4780 tube_ids_to_remove.dedup();
4781 for id in tube_ids_to_remove {
4782 // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
4783 // this automatically when the FD inserted into the `wait_ctx` is closed after this
4784 // if-block, but this removal can be deferred unpredictably. In some instances where the
4785 // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
4786 // that has already been closed. Because the token associated with that spurious event
4787 // now belongs to a different socket, the control loop will start to interact with
4788 // sockets that might not be ready to use. This can cause incorrect hangup detection or
4789 // blocking on a socket that will never be ready. See also: crbug.com/1019986
4790 if let Some(socket) = tubes.remove(&id) {
4791 wait_ctx
4792 .delete(socket.get_read_notifier())
4793 .context("failed to remove descriptor from wait context")?;
4794 }
4795 }
4796 Ok(())
4797 }
4798
4799 /// Start and jail a vhost-user device according to its configuration and a vhost listener string.
4800 ///
4801 /// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
4802 /// call outside of `start_devices`!
4803 ///
4804 /// Returns the pid of the jailed device process.
jail_and_start_vu_device<T: VirtioDeviceBuilder>( jail_config: Option<&JailConfig>, params: T, vhost: &str, name: &str, ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)>4805 fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
4806 jail_config: Option<&JailConfig>,
4807 params: T,
4808 vhost: &str,
4809 name: &str,
4810 ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
4811 let mut keep_rds = Vec::new();
4812
4813 base::syslog::push_descriptors(&mut keep_rds);
4814 cros_tracing::push_descriptors!(&mut keep_rds);
4815 metrics::push_descriptors(&mut keep_rds);
4816
4817 let jail_type = VirtioDeviceType::VhostUser;
4818
4819 // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
4820 // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
4821 let jail = params
4822 .create_jail(jail_config, jail_type)
4823 .with_context(|| format!("failed to create jail for {}", name))?
4824 .ok_or(())
4825 .or_else(|_| create_default_minijail())
4826 .with_context(|| format!("failed to create empty jail for {}", name))?;
4827
4828 // Create the device in the parent process, so the child does not need any privileges necessary
4829 // to do it (only runtime capabilities are required).
4830 let device = params
4831 .create_vhost_user_device(&mut keep_rds)
4832 .context("failed to create vhost-user device")?;
4833 let mut listener =
4834 VhostUserListener::new(vhost).context("failed to create the vhost listener")?;
4835 keep_rds.push(listener.as_raw_descriptor());
4836 let parent_resources = listener.take_parent_process_resources();
4837
4838 // Executor must be created before jail in order to prevent the jailed process from creating
4839 // unrestricted io_urings.
4840 let ex = Executor::new().context("Failed to create an Executor")?;
4841 keep_rds.extend(ex.as_raw_descriptors());
4842
4843 // Deduplicate the FDs since minijail expects them to be unique.
4844 keep_rds.sort_unstable();
4845 keep_rds.dedup();
4846
4847 // SAFETY:
4848 // Safe because we are keeping all the descriptors needed for the child to function.
4849 match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
4850 0 => {
4851 // In the child process.
4852
4853 // Free memory for the resources managed by the parent, without running drop() on them.
4854 // The parent will do it as we exit.
4855 let _ = std::mem::ManuallyDrop::new(parent_resources);
4856
4857 // Make sure the child process does not survive its parent.
4858 // SAFETY: trivially safe
4859 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
4860 panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
4861 }
4862
4863 // Set the name for the thread.
4864 const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
4865 let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
4866 let thread_name = CString::new(debug_label_trimmed).unwrap();
4867 // SAFETY:
4868 // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
4869 // an error if we don't anyway).
4870 let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };
4871
4872 // Run the device loop and terminate the child process once it exits.
4873 let res = match listener.run_device(ex, device) {
4874 Ok(()) => 0,
4875 Err(e) => {
4876 error!("error while running device {}: {:#}", name, e);
4877 1
4878 }
4879 };
4880 // SAFETY: trivially safe
4881 unsafe { libc::exit(res) };
4882 }
4883 pid => {
4884 // In the parent process. We will drop the device and listener when exiting this method.
4885 // This is fine as ownership for both has been transferred to the child process and they
4886 // will keep living there. We just retain `parent_resources` for things we are supposed
4887 // to clean up ourselves.
4888
4889 info!("process for device {} (PID {}) started", &name, pid);
4890 #[cfg(feature = "seccomp_trace")]
4891 debug!(
4892 "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
4893 pid,
4894 &name,
4895 read_jail_addr(&jail)
4896 );
4897 Ok((pid, parent_resources))
4898 }
4899 }
4900 }
4901
process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()>4902 fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
4903 let command = tube
4904 .recv::<VmRequest>()
4905 .context("failed to receive VmRequest")?;
4906 let resp = match command {
4907 VmRequest::DiskCommand {
4908 disk_index,
4909 ref command,
4910 } => match &disk_host_tubes.get(disk_index) {
4911 Some(tube) => handle_disk_command(command, tube),
4912 None => VmResponse::Err(base::Error::new(libc::ENODEV)),
4913 },
4914 request => {
4915 error!(
4916 "Request {:?} currently not supported in vhost user backend",
4917 request
4918 );
4919 VmResponse::Err(base::Error::new(libc::EPERM))
4920 }
4921 };
4922
4923 tube.send(&resp).context("failed to send VmResponse")?;
4924 Ok(())
4925 }
4926
start_vhost_user_control_server( control_server_socket: UnlinkUnixSeqpacketListener, disk_host_tubes: Vec<Tube>, )4927 fn start_vhost_user_control_server(
4928 control_server_socket: UnlinkUnixSeqpacketListener,
4929 disk_host_tubes: Vec<Tube>,
4930 ) {
4931 info!("Start vhost-user control server");
4932 loop {
4933 match control_server_socket.accept() {
4934 Ok(socket) => {
4935 let tube = match Tube::try_from(socket) {
4936 Ok(tube) => tube,
4937 Err(e) => {
4938 error!("failed to open tube: {:#}", e);
4939 return;
4940 }
4941 };
4942 if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
4943 error!("failed to process control request: {:#}", e);
4944 }
4945 }
4946 Err(e) => {
4947 error!("failed to establish connection: {}", e);
4948 }
4949 }
4950 }
4951 }
4952
start_devices(opts: DevicesCommand) -> anyhow::Result<()>4953 pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
4954 if let Some(async_executor) = opts.async_executor {
4955 Executor::set_default_executor_kind(async_executor)
4956 .context("Failed to set the default async executor")?;
4957 }
4958
4959 struct DeviceJailInfo {
4960 // Unique name for the device, in the form `foomatic-0`.
4961 name: String,
4962 _drop_resources: Option<Box<dyn std::any::Any>>,
4963 }
4964
4965 fn add_device<T: VirtioDeviceBuilder>(
4966 i: usize,
4967 device_params: T,
4968 vhost: &str,
4969 jail_config: Option<&JailConfig>,
4970 devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
4971 ) -> anyhow::Result<()> {
4972 let name = format!("{}-{}", T::NAME, i);
4973
4974 let (pid, _drop_resources) =
4975 jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;
4976
4977 devices_jails.insert(
4978 pid,
4979 DeviceJailInfo {
4980 name,
4981 _drop_resources,
4982 },
4983 );
4984
4985 Ok(())
4986 }
4987
4988 let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();
4989
4990 let jail = if opts.disable_sandbox {
4991 None
4992 } else {
4993 Some(&opts.jail)
4994 };
4995
4996 // Create control server socket
4997 let control_server_socket = opts.control_socket.map(|path| {
4998 UnlinkUnixSeqpacketListener(
4999 UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
5000 )
5001 });
5002
5003 // Create serial devices.
5004 for (i, params) in opts.serial.iter().enumerate() {
5005 let serial_config = ¶ms.device;
5006 add_device(i, serial_config, ¶ms.vhost, jail, &mut devices_jails)?;
5007 }
5008
5009 let mut disk_host_tubes = Vec::new();
5010 let control_socket_exists = control_server_socket.is_some();
5011 // Create block devices.
5012 for (i, params) in opts.block.iter().enumerate() {
5013 let tube = if control_socket_exists {
5014 let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
5015 disk_host_tubes.push(host_tube);
5016 Some(device_tube)
5017 } else {
5018 None
5019 };
5020 let disk_config = DiskConfig::new(¶ms.device, tube);
5021 add_device(i, disk_config, ¶ms.vhost, jail, &mut devices_jails)?;
5022 }
5023
5024 // Create vsock devices.
5025 for (i, params) in opts.vsock.iter().enumerate() {
5026 add_device(i, ¶ms.device, ¶ms.vhost, jail, &mut devices_jails)?;
5027 }
5028
5029 // Create network devices.
5030 #[cfg(feature = "net")]
5031 for (i, params) in opts.net.iter().enumerate() {
5032 add_device(i, ¶ms.device, ¶ms.vhost, jail, &mut devices_jails)?;
5033 }
5034
5035 // No device created, that's probably not intended - print the help in that case.
5036 if devices_jails.is_empty() {
5037 let err = DevicesCommand::from_args(
5038 &[&std::env::args().next().unwrap_or(String::from("crosvm"))],
5039 &["--help"],
5040 )
5041 .unwrap_err();
5042 println!("{}", err.output);
5043 return Ok(());
5044 }
5045
5046 let ex = Executor::new()?;
5047 if let Some(control_server_socket) = control_server_socket {
5048 // Start the control server in the parent process.
5049 ex.spawn_blocking(move || {
5050 start_vhost_user_control_server(control_server_socket, disk_host_tubes)
5051 })
5052 .detach();
5053 }
5054
5055 // Now wait for all device processes to return.
5056 while !devices_jails.is_empty() {
5057 match base::linux::wait_for_pid(-1, 0) {
5058 Err(e) => panic!("error waiting for child process to complete: {:#}", e),
5059 Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
5060 Some((_, info)) => {
5061 if let Some(status) = wait_status.code() {
5062 info!(
5063 "process for device {} (PID {}) exited with code {}",
5064 &info.name, pid, status
5065 );
5066 } else if let Some(signal) = wait_status.signal() {
5067 warn!(
5068 "process for device {} (PID {}) has been killed by signal {:?}",
5069 &info.name, pid, signal,
5070 );
5071 }
5072 }
5073 None => error!("pid {} is not one of our device processes", pid),
5074 },
5075 // `wait_for_pid` will necessarily return a PID because we asked to it wait for one to
5076 // complete.
5077 Ok((None, _)) => unreachable!(),
5078 }
5079 }
5080
5081 info!("all device processes have exited");
5082
5083 Ok(())
5084 }
5085
/// Set up crash reporting for this process and return the result of the underlying
/// `crash_report` initialization.
///
/// Each process MUST provide a unique `product_type`, otherwise crash reports from different
/// processes become indistinguishable.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    let attrs = crash_report::CrashReportAttributes {
        product_type: "emulator".to_owned(),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attrs)
}
5098
5099 #[cfg(test)]
5100 mod tests {
5101 use std::path::PathBuf;
5102
5103 use vm_memory::MemoryRegionPurpose;
5104
5105 use super::*;
5106
    // Create a file-backed mapping parameters struct with the given `address` and `size` and other
    // parameters set to default values.
    fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
        FileBackedMappingParameters {
            address,
            size,
            // No real backing file is needed for these layout-only tests.
            path: PathBuf::new(),
            offset: 0,
            writable: false,
            sync: false,
            align: false,
            // Marked as RAM so the mapping must fall inside a RAM region of the layout.
            ram: true,
        }
    }
5121
    // Exercises `punch_holes_in_guest_mem_layout_for_mappings` across the interesting placements
    // of a file-backed mapping relative to a two-region (low + high) guest memory layout:
    // no mapping, non-overlapping, start/end/middle of each region, spanning two regions, and a
    // region with a non-default purpose.
    #[test]
    fn guest_mem_file_backed_mappings_overlap() {
        // Base case: no file mappings; output layout should be identical.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping that does not overlap guest memory.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000, 0x1000)]
            )
            .unwrap_err()
            .to_string(),
            "RAM file-backed-mapping must be a subset of a RAM region",
        );

        // File mapping at the start of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0, 0x2000)]
            )
            .unwrap(),
            vec![
                (
                    GuestAddress(0),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0, 0x2000)),
                ),
                (
                    GuestAddress(0x2000),
                    0xD000_0000 - 0x2000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping at the end of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
                (
                    GuestAddress(0xD000_0000 - 0x2000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)),
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping fully contained within the middle of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0x1000, Default::default()),
                (
                    GuestAddress(0x1000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1000, 0x2000)),
                ),
                (
                    GuestAddress(0x3000),
                    0xD000_0000 - 0x3000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ],
        );

        // File mapping at the start of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1_0000_0000, 0x2000)),
                ),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ],
        );

        // File mapping at the end of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
                (
                    GuestAddress(0x1_0008_0000 - 0x2000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)),
                ),
            ],
        );

        // File mapping fully contained within the middle of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
                (
                    GuestAddress(0x1_0000_1000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
                        .file_backed(test_file_backed_mapping(0x1_0000_1000, 0x2000)),
                ),
                (
                    GuestAddress(0x1_0000_3000),
                    0x8_0000 - 0x3000,
                    Default::default()
                ),
            ],
        );

        // File mapping overlapping two guest memory regions.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
            )
            .unwrap_err()
            .to_string(),
            "RAM file-backed-mapping must be a subset of a RAM region",
        );

        // File mapping with different region purpose.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0x0000), 0x2000, Default::default()),
                    (
                        GuestAddress(0x2000),
                        0x2000,
                        MemoryRegionOptions::new().purpose(MemoryRegionPurpose::Bios)
                    ),
                ],
                &[test_file_backed_mapping(0x2000, 0x2000)]
            )
            .unwrap(),
            vec![
                (GuestAddress(0x0000), 0x2000, Default::default()),
                (
                    GuestAddress(0x2000),
                    0x2000,
                    MemoryRegionOptions::new()
                        .purpose(MemoryRegionPurpose::Bios)
                        .file_backed(test_file_backed_mapping(0x2000, 0x2000)),
                ),
            ],
        );
    }
5354
5355 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
5356 #[test]
normalized_cpu_ipc_ratios_simple()5357 fn normalized_cpu_ipc_ratios_simple() {
5358 let host_max_freq = 5000000;
5359 let mut cpu_frequencies = BTreeMap::new();
5360 cpu_frequencies.insert(0, vec![100000, 200000, 500000]);
5361 cpu_frequencies.insert(1, vec![50000, 75000, 200000]);
5362
5363 let mut cpu_ipc_ratio = BTreeMap::new();
5364 cpu_ipc_ratio.insert(0, 1024);
5365 cpu_ipc_ratio.insert(1, 512);
5366
5367 let normalized_cpu_ipc_ratios = normalize_cpu_ipc_ratios(
5368 cpu_frequencies.iter().map(|(cpu_id, frequencies)| {
5369 (
5370 *cpu_id,
5371 frequencies.iter().copied().max().unwrap_or_default(),
5372 )
5373 }),
5374 host_max_freq,
5375 |cpu_id| cpu_ipc_ratio.get(&cpu_id).copied().unwrap_or(1024),
5376 )
5377 .expect("normalize_cpu_ipc_ratios failed");
5378
5379 let ratios: Vec<(usize, u32)> = normalized_cpu_ipc_ratios.into_iter().collect();
5380 assert_eq!(ratios, vec![(0, 102), (1, 20)]);
5381 }
5382 }
5383