1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // TODO(b:240716507): There is huge chunk for code which depends on haxm, whpx or gvm to be enabled
6 // but isn't marked so. Remove this when we do so.
7 #![allow(dead_code, unused_imports, unused_variables, unreachable_code)]
8
9 pub(crate) mod control_server;
10 pub(crate) mod irq_wait;
11 pub(crate) mod main;
12 #[cfg(not(feature = "crash-report"))]
13 mod panic_hook;
14
15 mod generic;
16 use generic as product;
17 pub(crate) mod run_vcpu;
18
19 #[cfg(feature = "whpx")]
20 use std::arch::x86_64::__cpuid;
21 #[cfg(feature = "whpx")]
22 use std::arch::x86_64::__cpuid_count;
23 use std::cmp::Reverse;
24 use std::collections::BTreeMap;
25 use std::collections::HashMap;
26 use std::fs::File;
27 use std::fs::OpenOptions;
28 use std::io::stdin;
29 use std::iter;
30 use std::mem;
31 use std::os::windows::fs::OpenOptionsExt;
32 use std::path::PathBuf;
33 use std::sync::mpsc;
34 use std::sync::Arc;
35
36 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
37 use aarch64::AArch64 as Arch;
38 use acpi_tables::sdt::SDT;
39 use anyhow::anyhow;
40 use anyhow::bail;
41 use anyhow::Context;
42 use anyhow::Result;
43 use arch::CpuConfigArch;
44 use arch::DtbOverlay;
45 use arch::IrqChipArch;
46 use arch::LinuxArch;
47 use arch::RunnableLinuxVm;
48 use arch::VcpuArch;
49 use arch::VirtioDeviceStub;
50 use arch::VmArch;
51 use arch::VmComponents;
52 use arch::VmImage;
53 use base::enable_high_res_timers;
54 use base::error;
55 use base::info;
56 use base::open_file_or_duplicate;
57 use base::warn;
58 use base::AsRawDescriptor;
59 #[cfg(feature = "gpu")]
60 use base::BlockingMode;
61 use base::CloseNotifier;
62 use base::Event;
63 use base::EventToken;
64 use base::EventType;
65 use base::FlushOnDropTube;
66 #[cfg(feature = "gpu")]
67 use base::FramingMode;
68 use base::FromRawDescriptor;
69 use base::ProtoTube;
70 use base::RawDescriptor;
71 use base::ReadNotifier;
72 use base::RecvTube;
73 use base::SendTube;
74 #[cfg(feature = "gpu")]
75 use base::StreamChannel;
76 use base::Terminal;
77 use base::TriggeredEvent;
78 use base::Tube;
79 use base::TubeError;
80 use base::VmEventType;
81 use base::WaitContext;
82 use broker_ipc::common_child_setup;
83 use broker_ipc::CommonChildStartupArgs;
84 use control_server::ControlServer;
85 use crosvm_cli::sys::windows::exit::Exit;
86 use crosvm_cli::sys::windows::exit::ExitContext;
87 use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
88 use crosvm_cli::sys::windows::exit::ExitContextOption;
89 use devices::create_devices_worker_thread;
90 use devices::serial_device::SerialHardware;
91 use devices::serial_device::SerialParameters;
92 use devices::tsc::get_tsc_sync_mitigations;
93 use devices::tsc::standard_deviation;
94 use devices::tsc::TscSyncMitigations;
95 use devices::virtio;
96 use devices::virtio::block::DiskOption;
97 #[cfg(feature = "audio")]
98 use devices::virtio::snd::common_backend::VirtioSnd;
99 #[cfg(feature = "audio")]
100 use devices::virtio::snd::parameters::Parameters as SndParameters;
101 #[cfg(feature = "gpu")]
102 use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
103 #[cfg(feature = "gpu")]
104 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
105 #[cfg(feature = "gpu")]
106 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
107 #[cfg(feature = "gpu")]
108 use devices::virtio::vhost::user::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
109 #[cfg(feature = "gpu")]
110 use devices::virtio::vhost::user::gpu::sys::windows::run_gpu_device_worker;
111 #[cfg(feature = "audio")]
112 use devices::virtio::vhost::user::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
113 #[cfg(feature = "balloon")]
114 use devices::virtio::BalloonFeatures;
115 #[cfg(feature = "balloon")]
116 use devices::virtio::BalloonMode;
117 use devices::virtio::Console;
118 #[cfg(feature = "gpu")]
119 use devices::virtio::GpuParameters;
120 use devices::BusDeviceObj;
121 #[cfg(feature = "gvm")]
122 use devices::GvmIrqChip;
123 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
124 use devices::IrqChip;
125 use devices::UserspaceIrqChip;
126 use devices::VcpuRunState;
127 use devices::VirtioPciDevice;
128 #[cfg(feature = "whpx")]
129 use devices::WhpxSplitIrqChip;
130 #[cfg(feature = "gpu")]
131 use gpu_display::EventDevice;
132 #[cfg(feature = "gpu")]
133 use gpu_display::WindowProcedureThread;
134 #[cfg(feature = "gpu")]
135 use gpu_display::WindowProcedureThreadBuilder;
136 #[cfg(feature = "gvm")]
137 use hypervisor::gvm::Gvm;
138 #[cfg(feature = "gvm")]
139 use hypervisor::gvm::GvmVcpu;
140 #[cfg(feature = "gvm")]
141 use hypervisor::gvm::GvmVersion;
142 #[cfg(feature = "gvm")]
143 use hypervisor::gvm::GvmVm;
144 #[cfg(feature = "haxm")]
145 use hypervisor::haxm::get_use_ghaxm;
146 #[cfg(feature = "haxm")]
147 use hypervisor::haxm::set_use_ghaxm;
148 #[cfg(feature = "haxm")]
149 use hypervisor::haxm::Haxm;
150 #[cfg(feature = "haxm")]
151 use hypervisor::haxm::HaxmVcpu;
152 #[cfg(feature = "haxm")]
153 use hypervisor::haxm::HaxmVm;
154 #[cfg(feature = "whpx")]
155 use hypervisor::whpx::Whpx;
156 #[cfg(feature = "whpx")]
157 use hypervisor::whpx::WhpxFeature;
158 #[cfg(feature = "whpx")]
159 use hypervisor::whpx::WhpxVcpu;
160 #[cfg(feature = "whpx")]
161 use hypervisor::whpx::WhpxVm;
162 use hypervisor::Hypervisor;
163 #[cfg(feature = "whpx")]
164 use hypervisor::HypervisorCap;
165 #[cfg(feature = "whpx")]
166 use hypervisor::HypervisorX86_64;
167 use hypervisor::ProtectionType;
168 use hypervisor::Vm;
169 use irq_wait::IrqWaitWorker;
170 use jail::FakeMinijailStub as Minijail;
171 #[cfg(not(feature = "crash-report"))]
172 pub(crate) use panic_hook::set_panic_hook;
173 use product::create_snd_mute_tube_pair;
174 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
175 use product::create_snd_state_tube;
176 use product::handle_pvclock_request;
177 use product::merge_session_invariants;
178 use product::run_ime_thread;
179 use product::set_package_name;
180 pub(crate) use product::setup_metrics_reporting;
181 use product::start_service_ipc_listener;
182 use product::RunControlArgs;
183 use product::ServiceVmState;
184 use product::Token;
185 use resources::SystemAllocator;
186 use run_vcpu::run_all_vcpus;
187 use run_vcpu::VcpuRunMode;
188 use rutabaga_gfx::RutabagaGralloc;
189 use rutabaga_gfx::RutabagaGrallocBackendFlags;
190 use smallvec::SmallVec;
191 use sync::Mutex;
192 use tube_transporter::TubeToken;
193 use tube_transporter::TubeTransporterReader;
194 use vm_control::api::VmMemoryClient;
195 #[cfg(feature = "balloon")]
196 use vm_control::BalloonControlCommand;
197 #[cfg(feature = "balloon")]
198 use vm_control::BalloonTube;
199 use vm_control::DeviceControlCommand;
200 use vm_control::IrqHandlerRequest;
201 use vm_control::PvClockCommand;
202 use vm_control::VcpuControl;
203 use vm_control::VmMemoryRegionState;
204 use vm_control::VmMemoryRequest;
205 use vm_control::VmRequest;
206 use vm_control::VmResponse;
207 use vm_control::VmRunMode;
208 use vm_memory::GuestAddress;
209 use vm_memory::GuestMemory;
210 use win_util::ProcessType;
211 #[cfg(feature = "whpx")]
212 use x86_64::cpuid::adjust_cpuid;
213 #[cfg(feature = "whpx")]
214 use x86_64::cpuid::CpuIdContext;
215 #[cfg(all(target_arch = "x86_64", feature = "haxm"))]
216 use x86_64::get_cpu_manufacturer;
217 #[cfg(all(target_arch = "x86_64", feature = "haxm"))]
218 use x86_64::CpuManufacturer;
219 #[cfg(target_arch = "x86_64")]
220 use x86_64::X8664arch as Arch;
221
222 use crate::crosvm::config::Config;
223 use crate::crosvm::config::Executable;
224 use crate::crosvm::config::InputDeviceOption;
225 #[cfg(any(feature = "gvm", feature = "whpx"))]
226 use crate::crosvm::config::IrqChipKind;
227 #[cfg(feature = "gpu")]
228 use crate::crosvm::config::TouchDeviceOption;
229 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
230 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
231 use crate::crosvm::sys::config::HypervisorKind;
232 use crate::crosvm::sys::windows::broker::BrokerTubes;
233 #[cfg(feature = "stats")]
234 use crate::crosvm::sys::windows::stats::StatisticsCollector;
235 #[cfg(feature = "gpu")]
236 pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
237 #[cfg(feature = "audio")]
238 pub(crate) use crate::sys::windows::product::get_snd_product_configs;
239 #[cfg(feature = "gpu")]
240 pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
241 use crate::sys::windows::product::log_descriptor;
242 #[cfg(feature = "audio")]
243 pub(crate) use crate::sys::windows::product::num_input_sound_devices;
244 #[cfg(feature = "audio")]
245 pub(crate) use crate::sys::windows::product::num_input_sound_streams;
246 use crate::sys::windows::product::spawn_anti_tamper_thread;
247 use crate::sys::windows::product::MetricEventType;
248
249 const DEFAULT_GUEST_CID: u64 = 3;
250
251 // by default, if enabled, the balloon WS features will use 4 bins.
252 const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;
253
254 enum TaggedControlTube {
255 Vm(FlushOnDropTube),
256 Product(product::TaggedControlTube),
257 }
258
259 impl ReadNotifier for TaggedControlTube {
get_read_notifier(&self) -> &dyn AsRawDescriptor260 fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
261 match self {
262 Self::Vm(tube) => tube.0.get_read_notifier(),
263 Self::Product(tube) => tube.get_read_notifier(),
264 }
265 }
266 }
267
268 impl CloseNotifier for TaggedControlTube {
get_close_notifier(&self) -> &dyn AsRawDescriptor269 fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
270 match self {
271 Self::Vm(tube) => tube.0.get_close_notifier(),
272 Self::Product(tube) => tube.get_close_notifier(),
273 }
274 }
275 }
276
277 pub enum ExitState {
278 Reset,
279 Stop,
280 Crash,
281 #[allow(dead_code)]
282 GuestPanic,
283 WatchdogReset,
284 }
285
286 type DeviceResult<T = VirtioDeviceStub> = Result<T>;
287
create_vhost_user_block_device(cfg: &Config, disk_device_tube: Tube) -> DeviceResult288 fn create_vhost_user_block_device(cfg: &Config, disk_device_tube: Tube) -> DeviceResult {
289 let dev = virtio::VhostUserFrontend::new(
290 virtio::DeviceType::Block,
291 virtio::base_features(cfg.protection_type),
292 disk_device_tube,
293 None,
294 None,
295 )
296 .exit_context(
297 Exit::VhostUserBlockDeviceNew,
298 "failed to set up vhost-user block device",
299 )?;
300
301 Ok(VirtioDeviceStub {
302 dev: Box::new(dev),
303 jail: None,
304 })
305 }
306
create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult307 fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
308 let features = virtio::base_features(cfg.protection_type);
309 let dev = virtio::BlockAsync::new(
310 features,
311 disk.open()?,
312 disk,
313 Some(disk_device_tube),
314 None,
315 None,
316 )
317 .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;
318
319 Ok(VirtioDeviceStub {
320 dev: Box::new(dev),
321 jail: None,
322 })
323 }
324
325 #[cfg(feature = "gpu")]
create_vhost_user_gpu_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult326 fn create_vhost_user_gpu_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult {
327 let dev = virtio::VhostUserFrontend::new(
328 virtio::DeviceType::Gpu,
329 base_features,
330 vhost_user_tube,
331 None,
332 None,
333 )
334 .exit_context(
335 Exit::VhostUserGpuDeviceNew,
336 "failed to set up vhost-user gpu device",
337 )?;
338
339 Ok(VirtioDeviceStub {
340 dev: Box::new(dev),
341 jail: None,
342 })
343 }
344
345 #[cfg(feature = "audio")]
create_snd_device( cfg: &Config, parameters: SndParameters, _product_args: SndBackendConfigProduct, ) -> DeviceResult346 fn create_snd_device(
347 cfg: &Config,
348 parameters: SndParameters,
349 _product_args: SndBackendConfigProduct,
350 ) -> DeviceResult {
351 let features = virtio::base_features(cfg.protection_type);
352 let dev = VirtioSnd::new(features, parameters)
353 .exit_context(Exit::VirtioSoundDeviceNew, "failed to create snd device")?;
354
355 Ok(VirtioDeviceStub {
356 dev: Box::new(dev),
357 jail: None,
358 })
359 }
360
361 #[cfg(feature = "audio")]
create_vhost_user_snd_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult362 fn create_vhost_user_snd_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult {
363 let dev = virtio::VhostUserFrontend::new(
364 virtio::DeviceType::Sound,
365 base_features,
366 vhost_user_tube,
367 None,
368 None,
369 )
370 .exit_context(
371 Exit::VhostUserSndDeviceNew,
372 "failed to set up vhost-user snd device",
373 )?;
374
375 Ok(VirtioDeviceStub {
376 dev: Box::new(dev),
377 jail: None,
378 })
379 }
380
381 #[cfg(feature = "gpu")]
create_multi_touch_device( cfg: &Config, event_pipe: StreamChannel, width: u32, height: u32, name: Option<&str>, idx: u32, ) -> DeviceResult382 fn create_multi_touch_device(
383 cfg: &Config,
384 event_pipe: StreamChannel,
385 width: u32,
386 height: u32,
387 name: Option<&str>,
388 idx: u32,
389 ) -> DeviceResult {
390 let dev = virtio::input::new_multi_touch(
391 idx,
392 event_pipe,
393 width,
394 height,
395 name,
396 virtio::base_features(cfg.protection_type),
397 )
398 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
399 Ok(VirtioDeviceStub {
400 dev: Box::new(dev),
401 jail: None,
402 })
403 }
404
405 #[cfg(feature = "gpu")]
create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult406 fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
407 let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
408 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
409 Ok(VirtioDeviceStub {
410 dev: Box::new(dev),
411 jail: None,
412 })
413 }
414
415 #[cfg(feature = "slirp")]
create_vhost_user_net_device(cfg: &Config, net_device_tube: Tube) -> DeviceResult416 fn create_vhost_user_net_device(cfg: &Config, net_device_tube: Tube) -> DeviceResult {
417 let features = virtio::base_features(cfg.protection_type);
418 let dev = virtio::VhostUserFrontend::new(
419 virtio::DeviceType::Net,
420 features,
421 net_device_tube,
422 None,
423 None,
424 )
425 .exit_context(
426 Exit::VhostUserNetDeviceNew,
427 "failed to set up vhost-user net device",
428 )?;
429
430 Ok(VirtioDeviceStub {
431 dev: Box::new(dev),
432 jail: None,
433 })
434 }
435
create_rng_device(cfg: &Config) -> DeviceResult436 fn create_rng_device(cfg: &Config) -> DeviceResult {
437 let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
438 .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;
439
440 Ok(VirtioDeviceStub {
441 dev: Box::new(dev),
442 jail: None,
443 })
444 }
445
create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult446 fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
447 let mut keep_rds = Vec::new();
448 let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
449 let dev = param
450 .create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
451 .exit_context(Exit::CreateConsole, "failed to create console device")?;
452
453 Ok(VirtioDeviceStub {
454 dev: Box::new(dev),
455 jail: None,
456 })
457 }
458
459 #[cfg(feature = "balloon")]
create_balloon_device( cfg: &Config, balloon_device_tube: Tube, dynamic_mapping_device_tube: Tube, inflate_tube: Option<Tube>, init_balloon_size: u64, ) -> DeviceResult460 fn create_balloon_device(
461 cfg: &Config,
462 balloon_device_tube: Tube,
463 dynamic_mapping_device_tube: Tube,
464 inflate_tube: Option<Tube>,
465 init_balloon_size: u64,
466 ) -> DeviceResult {
467 let balloon_features =
468 (cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
469 let dev = virtio::Balloon::new(
470 virtio::base_features(cfg.protection_type),
471 balloon_device_tube,
472 VmMemoryClient::new(dynamic_mapping_device_tube),
473 inflate_tube,
474 init_balloon_size,
475 if cfg.strict_balloon {
476 BalloonMode::Strict
477 } else {
478 BalloonMode::Relaxed
479 },
480 balloon_features,
481 #[cfg(feature = "registered_events")]
482 None,
483 VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
484 )
485 .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;
486
487 Ok(VirtioDeviceStub {
488 dev: Box::new(dev),
489 jail: None,
490 })
491 }
492
create_vsock_device(cfg: &Config) -> DeviceResult493 fn create_vsock_device(cfg: &Config) -> DeviceResult {
494 // We only support a single guest, so we can confidently assign a default
495 // CID if one isn't provided. We choose the lowest non-reserved value.
496 let dev = virtio::vsock::Vsock::new(
497 cfg.vsock
498 .as_ref()
499 .map(|cfg| cfg.cid)
500 .unwrap_or(DEFAULT_GUEST_CID),
501 cfg.host_guid.clone(),
502 virtio::base_features(cfg.protection_type),
503 )
504 .exit_context(
505 Exit::UserspaceVsockDeviceNew,
506 "failed to create userspace vsock device",
507 )?;
508
509 Ok(VirtioDeviceStub {
510 dev: Box::new(dev),
511 jail: None,
512 })
513 }
514
create_virtio_devices( cfg: &mut Config, vm_evt_wrtube: &SendTube, #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>, disk_device_tubes: &mut Vec<Tube>, balloon_device_tube: Option<Tube>, pvclock_device_tube: Option<Tube>, dynamic_mapping_device_tube: Option<Tube>, inflate_tube: Option<Tube>, init_balloon_size: u64, tsc_frequency: u64, virtio_snd_state_device_tube: Option<Tube>, virtio_snd_control_device_tube: Option<Tube>, ) -> DeviceResult<Vec<VirtioDeviceStub>>515 fn create_virtio_devices(
516 cfg: &mut Config,
517 vm_evt_wrtube: &SendTube,
518 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
519 disk_device_tubes: &mut Vec<Tube>,
520 balloon_device_tube: Option<Tube>,
521 pvclock_device_tube: Option<Tube>,
522 dynamic_mapping_device_tube: Option<Tube>,
523 inflate_tube: Option<Tube>,
524 init_balloon_size: u64,
525 tsc_frequency: u64,
526 virtio_snd_state_device_tube: Option<Tube>,
527 virtio_snd_control_device_tube: Option<Tube>,
528 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
529 let mut devs = Vec::new();
530
531 if cfg.block_vhost_user_tube.is_empty() {
532 // Disk devices must precede virtio-console devices or the kernel does not boot.
533 // TODO(b/171215421): figure out why this ordering is required and fix it.
534 for disk in &cfg.disks {
535 let disk_device_tube = disk_device_tubes.remove(0);
536 devs.push(create_block_device(cfg, disk, disk_device_tube)?);
537 }
538 } else {
539 info!("Starting up vhost user block backends...");
540 for _disk in &cfg.disks {
541 let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
542 devs.push(create_vhost_user_block_device(cfg, disk_device_tube)?);
543 }
544 }
545
546 for (_, param) in cfg
547 .serial_parameters
548 .iter()
549 .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
550 {
551 let dev = create_console_device(cfg, param)?;
552 devs.push(dev);
553 }
554
555 #[cfg(feature = "audio")]
556 if product::virtio_sound_enabled() {
557 let snd_split_config = cfg
558 .snd_split_config
559 .as_mut()
560 .expect("snd_split_config must exist");
561 let snd_vmm_config = snd_split_config
562 .vmm_config
563 .as_mut()
564 .expect("snd_vmm_config must exist");
565 product::push_snd_control_tubes(control_tubes, snd_vmm_config);
566
567 match snd_split_config.backend_config.take() {
568 None => {
569 // No backend config present means the backend is running in another process.
570 devs.push(create_vhost_user_snd_device(
571 virtio::base_features(cfg.protection_type),
572 snd_vmm_config
573 .main_vhost_user_tube
574 .take()
575 .expect("Snd VMM vhost-user tube should be set"),
576 )?);
577 }
578 Some(backend_config) => {
579 // Backend config present, so initialize Snd in this process.
580 devs.push(create_snd_device(
581 cfg,
582 backend_config.parameters,
583 backend_config.product_config,
584 )?);
585 }
586 }
587 }
588
589 if let Some(tube) = pvclock_device_tube {
590 product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
591 }
592
593 devs.push(create_rng_device(cfg)?);
594
595 #[cfg(feature = "slirp")]
596 if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
597 devs.push(create_vhost_user_net_device(cfg, net_vhost_user_tube)?);
598 }
599
600 #[cfg(feature = "balloon")]
601 if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
602 (balloon_device_tube, dynamic_mapping_device_tube)
603 {
604 devs.push(create_balloon_device(
605 cfg,
606 balloon_device_tube,
607 dynamic_mapping_device_tube,
608 inflate_tube,
609 init_balloon_size,
610 )?);
611 }
612
613 devs.push(create_vsock_device(cfg)?);
614
615 #[cfg(feature = "gpu")]
616 let event_devices = if let Some(InputEventSplitConfig {
617 backend_config,
618 vmm_config,
619 }) = cfg.input_event_split_config.take()
620 {
621 devs.extend(
622 create_virtio_input_event_devices(cfg, vmm_config)
623 .context("create input event devices")?,
624 );
625 backend_config.map(|cfg| cfg.event_devices)
626 } else {
627 None
628 };
629
630 #[cfg(feature = "gpu")]
631 if let Some(wndproc_thread_vmm_config) = cfg
632 .window_procedure_thread_split_config
633 .as_mut()
634 .map(|split_cfg| &mut split_cfg.vmm_config)
635 {
636 product::push_window_procedure_thread_control_tubes(
637 control_tubes,
638 wndproc_thread_vmm_config,
639 );
640 }
641
642 #[cfg(feature = "gpu")]
643 let mut wndproc_thread = cfg
644 .window_procedure_thread_split_config
645 .as_mut()
646 .and_then(|cfg| cfg.wndproc_thread_builder.take())
647 .map(WindowProcedureThreadBuilder::start_thread)
648 .transpose()
649 .context("Failed to start the window procedure thread.")?;
650
651 #[cfg(feature = "gpu")]
652 if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
653 devs.push(create_virtio_gpu_device(
654 cfg,
655 gpu_vmm_config,
656 event_devices,
657 &mut wndproc_thread,
658 control_tubes,
659 )?);
660 }
661
662 Ok(devs)
663 }
664
665 #[cfg(feature = "gpu")]
create_virtio_input_event_devices( cfg: &Config, mut input_event_vmm_config: InputEventVmmConfig, ) -> DeviceResult<Vec<VirtioDeviceStub>>666 fn create_virtio_input_event_devices(
667 cfg: &Config,
668 mut input_event_vmm_config: InputEventVmmConfig,
669 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
670 let mut devs = Vec::new();
671
672 // Iterate event devices, create the VMM end.
673 let mut multi_touch_pipes = input_event_vmm_config
674 .multi_touch_pipes
675 .drain(..)
676 .enumerate();
677 for input in &cfg.virtio_input {
678 match input {
679 InputDeviceOption::SingleTouch { .. } => {
680 unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
681 }
682 InputDeviceOption::MultiTouch {
683 width,
684 height,
685 name,
686 ..
687 } => {
688 let Some((idx, pipe)) = multi_touch_pipes.next() else {
689 break;
690 };
691 let mut width = *width;
692 let mut height = *height;
693 if idx == 0 {
694 if width.is_none() {
695 width = cfg.display_input_width;
696 }
697 if height.is_none() {
698 height = cfg.display_input_height;
699 }
700 }
701 devs.push(create_multi_touch_device(
702 cfg,
703 pipe,
704 width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
705 height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
706 name.as_deref(),
707 idx as u32,
708 )?);
709 }
710 _ => {}
711 }
712 }
713 drop(multi_touch_pipes);
714
715 product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;
716
717 for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
718 devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
719 }
720
721 let keyboard_pipe = input_event_vmm_config
722 .keyboard_pipes
723 .pop()
724 .expect("at least one keyboard should be in GPU VMM config");
725 let dev = virtio::input::new_keyboard(
726 /* idx= */ 0,
727 keyboard_pipe,
728 virtio::base_features(cfg.protection_type),
729 )
730 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
731
732 devs.push(VirtioDeviceStub {
733 dev: Box::new(dev),
734 jail: None,
735 });
736
737 Ok(devs)
738 }
739
740 #[cfg(feature = "gpu")]
create_virtio_gpu_device( cfg: &mut Config, mut gpu_vmm_config: GpuVmmConfig, event_devices: Option<Vec<EventDevice>>, wndproc_thread: &mut Option<WindowProcedureThread>, #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>, ) -> DeviceResult<VirtioDeviceStub>741 fn create_virtio_gpu_device(
742 cfg: &mut Config,
743 mut gpu_vmm_config: GpuVmmConfig,
744 event_devices: Option<Vec<EventDevice>>,
745 wndproc_thread: &mut Option<WindowProcedureThread>,
746 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
747 ) -> DeviceResult<VirtioDeviceStub> {
748 let resource_bridges = Vec::<Tube>::new();
749
750 product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);
751
752 // If the GPU backend is passed, start up the vhost-user worker in the main process.
753 if let Some(backend_config) = cfg.gpu_backend_config.take() {
754 let event_devices = event_devices.ok_or_else(|| {
755 anyhow!("event devices are missing when creating virtio-gpu in the current process.")
756 })?;
757 let wndproc_thread = wndproc_thread
758 .take()
759 .ok_or_else(|| anyhow!("Window procedure thread is missing."))?;
760
761 std::thread::spawn(move || {
762 run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
763 });
764 }
765
766 // The GPU is always vhost-user, even if running in the main process.
767 create_vhost_user_gpu_device(
768 virtio::base_features(cfg.protection_type),
769 gpu_vmm_config
770 .main_vhost_user_tube
771 .take()
772 .expect("GPU VMM vhost-user tube should be set"),
773 )
774 .context("create vhost-user GPU device")
775 }
776
create_devices( cfg: &mut Config, mem: &GuestMemory, exit_evt_wrtube: &SendTube, irq_control_tubes: &mut Vec<Tube>, vm_memory_control_tubes: &mut Vec<Tube>, control_tubes: &mut Vec<TaggedControlTube>, disk_device_tubes: &mut Vec<Tube>, balloon_device_tube: Option<Tube>, pvclock_device_tube: Option<Tube>, dynamic_mapping_device_tube: Option<Tube>, inflate_tube: Option<Tube>, init_balloon_size: u64, tsc_frequency: u64, virtio_snd_state_device_tube: Option<Tube>, virtio_snd_control_device_tube: Option<Tube>, ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>>777 fn create_devices(
778 cfg: &mut Config,
779 mem: &GuestMemory,
780 exit_evt_wrtube: &SendTube,
781 irq_control_tubes: &mut Vec<Tube>,
782 vm_memory_control_tubes: &mut Vec<Tube>,
783 control_tubes: &mut Vec<TaggedControlTube>,
784 disk_device_tubes: &mut Vec<Tube>,
785 balloon_device_tube: Option<Tube>,
786 pvclock_device_tube: Option<Tube>,
787 dynamic_mapping_device_tube: Option<Tube>,
788 inflate_tube: Option<Tube>,
789 init_balloon_size: u64,
790 tsc_frequency: u64,
791 virtio_snd_state_device_tube: Option<Tube>,
792 virtio_snd_control_device_tube: Option<Tube>,
793 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
794 let stubs = create_virtio_devices(
795 cfg,
796 exit_evt_wrtube,
797 control_tubes,
798 disk_device_tubes,
799 balloon_device_tube,
800 pvclock_device_tube,
801 dynamic_mapping_device_tube,
802 inflate_tube,
803 init_balloon_size,
804 tsc_frequency,
805 virtio_snd_state_device_tube,
806 virtio_snd_control_device_tube,
807 )?;
808
809 let mut pci_devices = Vec::new();
810
811 for stub in stubs {
812 let (msi_host_tube, msi_device_tube) =
813 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
814 irq_control_tubes.push(msi_host_tube);
815
816 let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
817 let (host_tube, device_tube) =
818 Tube::pair().context("failed to create VVU proxy tube")?;
819 vm_memory_control_tubes.push(host_tube);
820 Some(device_tube)
821 } else {
822 None
823 };
824
825 let (ioevent_host_tube, ioevent_device_tube) =
826 Tube::pair().context("failed to create ioevent tube")?;
827 vm_memory_control_tubes.push(ioevent_host_tube);
828
829 let (vm_control_host_tube, vm_control_device_tube) =
830 Tube::pair().context("failed to create vm_control tube")?;
831 control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
832 vm_control_host_tube,
833 )));
834
835 let dev = Box::new(
836 VirtioPciDevice::new(
837 mem.clone(),
838 stub.dev,
839 msi_device_tube,
840 cfg.disable_virtio_intx,
841 shared_memory_tube.map(VmMemoryClient::new),
842 VmMemoryClient::new(ioevent_device_tube),
843 vm_control_device_tube,
844 )
845 .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
846 ) as Box<dyn BusDeviceObj>;
847 pci_devices.push((dev, stub.jail));
848 }
849
850 Ok(pci_devices)
851 }
852
853 #[derive(Debug)]
854 struct PvClockError(String);
855
handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( event: &TriggeredEvent<Token>, vm_control_ids_to_remove: &mut Vec<usize>, next_control_id: &mut usize, service_vm_state: &mut ServiceVmState, disk_host_tubes: &[Tube], ipc_main_loop_tube: Option<&Tube>, #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>, vm_evt_rdtube: &RecvTube, control_tubes: &mut BTreeMap<usize, TaggedControlTube>, guest_os: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>, virtio_snd_host_mute_tube: &mut Option<Tube>, proto_main_loop_tube: Option<&ProtoTube>, anti_tamper_main_thread_tube: &Option<ProtoTube>, #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>, memory_size_mb: u64, vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>, pvclock_host_tube: &Option<Tube>, run_mode_arc: &VcpuRunMode, region_state: &mut VmMemoryRegionState, vm_control_server: Option<&mut ControlServer>, irq_handler_control: &Tube, device_ctrl_tube: &Tube, wait_ctx: &WaitContext<Token>, force_s2idle: bool, vcpu_control_channels: &[mpsc::Sender<VcpuControl>], ) -> Result<Option<ExitState>>856 fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
857 event: &TriggeredEvent<Token>,
858 vm_control_ids_to_remove: &mut Vec<usize>,
859 next_control_id: &mut usize,
860 service_vm_state: &mut ServiceVmState,
861 disk_host_tubes: &[Tube],
862 ipc_main_loop_tube: Option<&Tube>,
863 #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
864 vm_evt_rdtube: &RecvTube,
865 control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
866 guest_os: &mut RunnableLinuxVm<V, Vcpu>,
867 sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
868 virtio_snd_host_mute_tube: &mut Option<Tube>,
869 proto_main_loop_tube: Option<&ProtoTube>,
870 anti_tamper_main_thread_tube: &Option<ProtoTube>,
871 #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
872 memory_size_mb: u64,
873 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
874 pvclock_host_tube: &Option<Tube>,
875 run_mode_arc: &VcpuRunMode,
876 region_state: &mut VmMemoryRegionState,
877 vm_control_server: Option<&mut ControlServer>,
878 irq_handler_control: &Tube,
879 device_ctrl_tube: &Tube,
880 wait_ctx: &WaitContext<Token>,
881 force_s2idle: bool,
882 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
883 ) -> Result<Option<ExitState>> {
884 let execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
885 let mut run_mode_opt = None;
886 let vcpu_size = vcpu_boxes.lock().len();
887 let resp = request.execute(
888 &guest_os.vm,
889 &mut run_mode_opt,
890 disk_host_tubes,
891 &mut guest_os.pm,
892 #[cfg(feature = "gpu")]
893 gpu_control_tube,
894 #[cfg(not(feature = "gpu"))]
895 None,
896 None,
897 &mut None,
898 |msg| {
899 kick_all_vcpus(
900 run_mode_arc,
901 vcpu_control_channels,
902 vcpu_boxes,
903 guest_os.irq_chip.as_ref(),
904 pvclock_host_tube,
905 msg,
906 );
907 },
908 force_s2idle,
909 #[cfg(feature = "swap")]
910 None,
911 device_ctrl_tube,
912 vcpu_size,
913 irq_handler_control,
914 || guest_os.irq_chip.as_ref().snapshot(vcpu_size),
915 );
916 (resp, run_mode_opt)
917 };
918
919 match event.token {
920 Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
921 Ok(vm_event) => {
922 let exit_state = match vm_event {
923 VmEventType::Exit => {
924 info!("vcpu requested shutdown");
925 Some(ExitState::Stop)
926 }
927 VmEventType::Reset => {
928 info!("vcpu requested reset");
929 Some(ExitState::Reset)
930 }
931 VmEventType::Crash => {
932 info!("vcpu crashed");
933 Some(ExitState::Crash)
934 }
935 VmEventType::Panic(_) => {
936 error!("got pvpanic event. this event is not expected on Windows.");
937 None
938 }
939 VmEventType::WatchdogReset => {
940 info!("vcpu stall detected");
941 Some(ExitState::WatchdogReset)
942 }
943 };
944 return Ok(exit_state);
945 }
946 Err(e) => {
947 warn!("failed to recv VmEvent: {}", e);
948 }
949 },
950 Token::BrokerShutdown => {
951 info!("main loop got broker shutdown event");
952 return Ok(Some(ExitState::Stop));
953 }
954 Token::VmControlServer => {
955 let server =
956 vm_control_server.expect("control server must exist if this event triggers");
957 let client = server.accept();
958 let id = *next_control_id;
959 *next_control_id += 1;
960 wait_ctx
961 .add(client.0.get_read_notifier(), Token::VmControl { id })
962 .exit_context(
963 Exit::WaitContextAdd,
964 "failed to add trigger to wait context",
965 )?;
966 wait_ctx
967 .add(client.0.get_close_notifier(), Token::VmControl { id })
968 .exit_context(
969 Exit::WaitContextAdd,
970 "failed to add trigger to wait context",
971 )?;
972 control_tubes.insert(id, TaggedControlTube::Vm(client));
973 }
974 #[allow(clippy::collapsible_match)]
975 Token::VmControl { id } => {
976 if let Some(tube) = control_tubes.get(&id) {
977 #[allow(clippy::single_match)]
978 match tube {
979 TaggedControlTube::Product(product_tube) => {
980 product::handle_tagged_control_tube_event(
981 product_tube,
982 virtio_snd_host_mute_tube,
983 service_vm_state,
984 ipc_main_loop_tube,
985 )
986 }
987 TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
988 Ok(request) => {
989 let mut run_mode_opt = None;
990 let response = match request {
991 VmRequest::HotPlugVfioCommand { device, add } => {
992 // Suppress warnings.
993 let _ = (device, add);
994 unimplemented!("not implemented on Windows");
995 }
996 #[cfg(feature = "registered_events")]
997 VmRequest::RegisterListener { socket_addr, event } => {
998 unimplemented!("not implemented on Windows");
999 }
1000 #[cfg(feature = "registered_events")]
1001 VmRequest::UnregisterListener { socket_addr, event } => {
1002 unimplemented!("not implemented on Windows");
1003 }
1004 #[cfg(feature = "registered_events")]
1005 VmRequest::Unregister { socket_addr } => {
1006 unimplemented!("not implemented on Windows");
1007 }
1008 #[cfg(feature = "balloon")]
1009 VmRequest::BalloonCommand(cmd) => {
1010 if let Some(balloon_tube) = balloon_tube {
1011 if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
1012 {
1013 if key != id {
1014 unimplemented!("not implemented on Windows");
1015 }
1016 Some(r)
1017 } else {
1018 None
1019 }
1020 } else {
1021 error!("balloon not enabled");
1022 None
1023 }
1024 }
1025 _ => {
1026 let (resp, run_mode_ret) =
1027 execute_vm_request(request, guest_os);
1028 run_mode_opt = run_mode_ret;
1029 Some(resp)
1030 }
1031 };
1032
1033 if let Some(response) = response {
1034 if let Err(e) = tube.0.send(&response) {
1035 error!("failed to send VmResponse: {}", e);
1036 }
1037 }
1038 if let Some(exit_state) =
1039 handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1040 {
1041 return Ok(Some(exit_state));
1042 }
1043 }
1044 Err(e) => {
1045 if let TubeError::Disconnected = e {
1046 vm_control_ids_to_remove.push(id);
1047 } else {
1048 error!("failed to recv VmRequest: {}", e);
1049 }
1050 }
1051 },
1052 }
1053 }
1054 }
1055 #[cfg(feature = "balloon")]
1056 Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
1057 Ok(resp) => {
1058 for (resp, idx) in resp {
1059 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
1060 if let Err(e) = tube.0.send(&resp) {
1061 error!("failed to send VmResponse: {}", e);
1062 }
1063 } else {
1064 error!("Bad tube index {}", idx);
1065 }
1066 }
1067 }
1068 Err(err) => {
1069 error!("Error processing balloon tube {:?}", err)
1070 }
1071 },
1072 #[cfg(not(feature = "balloon"))]
1073 Token::BalloonTube => unreachable!("balloon tube not registered"),
1074 #[allow(unreachable_patterns)]
1075 _ => {
1076 let run_mode_opt = product::handle_received_token(
1077 &event.token,
1078 anti_tamper_main_thread_tube,
1079 #[cfg(feature = "balloon")]
1080 balloon_tube,
1081 control_tubes,
1082 guest_os,
1083 ipc_main_loop_tube,
1084 memory_size_mb,
1085 proto_main_loop_tube,
1086 pvclock_host_tube,
1087 run_mode_arc,
1088 service_vm_state,
1089 vcpu_boxes,
1090 virtio_snd_host_mute_tube,
1091 execute_vm_request,
1092 );
1093 if let Some(exit_state) = handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1094 {
1095 return Ok(Some(exit_state));
1096 }
1097 }
1098 };
1099 Ok(None)
1100 }
1101
1102 /// Handles a run mode change (if one occurred) if one is pending as a
1103 /// result a VmRequest. The parameter, run_mode_opt, is the run mode change
1104 /// proposed by the VmRequest's execution.
1105 ///
1106 /// Returns the exit state, if it changed due to a run mode change.
1107 /// None otherwise.
handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( run_mode_opt: &Option<VmRunMode>, guest_os: &mut RunnableLinuxVm<V, Vcpu>, ) -> Option<ExitState>1108 fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1109 run_mode_opt: &Option<VmRunMode>,
1110 guest_os: &mut RunnableLinuxVm<V, Vcpu>,
1111 ) -> Option<ExitState> {
1112 if let Some(run_mode) = run_mode_opt {
1113 info!("control socket changed run mode to {}", run_mode);
1114 match run_mode {
1115 VmRunMode::Exiting => return Some(ExitState::Stop),
1116 other => {
1117 if other == &VmRunMode::Running {
1118 for dev in &guest_os.resume_notify_devices {
1119 dev.lock().resume_imminent();
1120 }
1121 }
1122 }
1123 }
1124 }
1125 // No exit state change.
1126 None
1127 }
1128
1129 /// Commands to control the VM Memory handler thread.
1130 #[derive(serde::Serialize, serde::Deserialize)]
1131 pub enum VmMemoryHandlerRequest {
1132 /// No response is sent for this command.
1133 Exit,
1134 }
1135
vm_memory_handler_thread( control_tubes: Vec<Tube>, mut vm: impl Vm, sys_allocator_mutex: Arc<Mutex<SystemAllocator>>, mut gralloc: RutabagaGralloc, handler_control: Tube, ) -> anyhow::Result<()>1136 fn vm_memory_handler_thread(
1137 control_tubes: Vec<Tube>,
1138 mut vm: impl Vm,
1139 sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
1140 mut gralloc: RutabagaGralloc,
1141 handler_control: Tube,
1142 ) -> anyhow::Result<()> {
1143 #[derive(EventToken)]
1144 enum Token {
1145 VmControl { id: usize },
1146 HandlerControl,
1147 }
1148
1149 let wait_ctx =
1150 WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
1151 .context("failed to build wait context")?;
1152 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1153 for (id, socket) in control_tubes.iter() {
1154 wait_ctx
1155 .add(socket.get_read_notifier(), Token::VmControl { id: *id })
1156 .context("failed to add descriptor to wait context")?;
1157 }
1158
1159 let mut region_state = VmMemoryRegionState::new();
1160
1161 'wait: loop {
1162 let events = {
1163 match wait_ctx.wait() {
1164 Ok(v) => v,
1165 Err(e) => {
1166 error!("failed to poll: {}", e);
1167 break;
1168 }
1169 }
1170 };
1171
1172 let mut vm_control_ids_to_remove = Vec::new();
1173 for event in events.iter().filter(|e| e.is_readable) {
1174 match event.token {
1175 Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
1176 Ok(request) => match request {
1177 VmMemoryHandlerRequest::Exit => break 'wait,
1178 },
1179 Err(e) => {
1180 if let TubeError::Disconnected = e {
1181 panic!("vm memory control tube disconnected.");
1182 } else {
1183 error!("failed to recv VmMemoryHandlerRequest: {}", e);
1184 }
1185 }
1186 },
1187
1188 Token::VmControl { id } => {
1189 if let Some(tube) = control_tubes.get(&id) {
1190 match tube.recv::<VmMemoryRequest>() {
1191 Ok(request) => {
1192 let response = request.execute(
1193 &mut vm,
1194 &mut sys_allocator_mutex.lock(),
1195 &mut gralloc,
1196 None,
1197 &mut region_state,
1198 );
1199 if let Err(e) = tube.send(&response) {
1200 error!("failed to send VmMemoryControlResponse: {}", e);
1201 }
1202 }
1203 Err(e) => {
1204 if let TubeError::Disconnected = e {
1205 vm_control_ids_to_remove.push(id);
1206 } else {
1207 error!("failed to recv VmMemoryControlRequest: {}", e);
1208 }
1209 }
1210 }
1211 }
1212 }
1213 }
1214 }
1215
1216 remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1217 if events
1218 .iter()
1219 .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
1220 {
1221 error!("vm memory handler control hung up but did not request an exit.");
1222 break 'wait;
1223 }
1224 }
1225 Ok(())
1226 }
1227
create_control_server( control_server_path: Option<PathBuf>, wait_ctx: &WaitContext<Token>, ) -> Result<Option<ControlServer>>1228 fn create_control_server(
1229 control_server_path: Option<PathBuf>,
1230 wait_ctx: &WaitContext<Token>,
1231 ) -> Result<Option<ControlServer>> {
1232 #[cfg(not(feature = "prod-build"))]
1233 {
1234 if let Some(path) = control_server_path {
1235 let server =
1236 ControlServer::new(path.to_str().expect("control socket path must be a string"))
1237 .exit_context(
1238 Exit::FailedToCreateControlServer,
1239 "failed to create control server",
1240 )?;
1241 wait_ctx
1242 .add(server.client_waiting(), Token::VmControlServer)
1243 .exit_context(
1244 Exit::WaitContextAdd,
1245 "failed to add control server to wait context",
1246 )?;
1247 return Ok(Some(server));
1248 }
1249 }
1250 Ok::<Option<ControlServer>, anyhow::Error>(None)
1251 }
1252
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut guest_os: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, control_tubes: Vec<TaggedControlTube>, irq_control_tubes: Vec<Tube>, vm_memory_control_tubes: Vec<Tube>, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>, broker_shutdown_evt: Option<Event>, balloon_host_tube: Option<Tube>, pvclock_host_tube: Option<Tube>, disk_host_tubes: Vec<Tube>, gralloc: RutabagaGralloc, #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>, service_pipe_name: Option<String>, memory_size_mb: u64, host_cpu_topology: bool, tsc_sync_mitigations: TscSyncMitigations, force_calibrated_tsc_leaf: bool, mut product_args: RunControlArgs, mut virtio_snd_host_mute_tube: Option<Tube>, restore_path: Option<PathBuf>, control_server_path: Option<PathBuf>, force_s2idle: bool, suspended: bool, ) -> Result<ExitState>1253 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1254 mut guest_os: RunnableLinuxVm<V, Vcpu>,
1255 sys_allocator: SystemAllocator,
1256 control_tubes: Vec<TaggedControlTube>,
1257 irq_control_tubes: Vec<Tube>,
1258 vm_memory_control_tubes: Vec<Tube>,
1259 vm_evt_rdtube: RecvTube,
1260 vm_evt_wrtube: SendTube,
1261 #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
1262 broker_shutdown_evt: Option<Event>,
1263 balloon_host_tube: Option<Tube>,
1264 pvclock_host_tube: Option<Tube>,
1265 disk_host_tubes: Vec<Tube>,
1266 gralloc: RutabagaGralloc,
1267 #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
1268 service_pipe_name: Option<String>,
1269 memory_size_mb: u64,
1270 host_cpu_topology: bool,
1271 tsc_sync_mitigations: TscSyncMitigations,
1272 force_calibrated_tsc_leaf: bool,
1273 mut product_args: RunControlArgs,
1274 mut virtio_snd_host_mute_tube: Option<Tube>,
1275 restore_path: Option<PathBuf>,
1276 control_server_path: Option<PathBuf>,
1277 force_s2idle: bool,
1278 suspended: bool,
1279 ) -> Result<ExitState> {
1280 let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
1281 start_service_ipc_listener(service_pipe_name)?;
1282
1283 let mut service_vm_state = product::create_service_vm_state(memory_size_mb);
1284
1285 let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
1286
1287 let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
1288 let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
1289 Exit::CreateTube,
1290 "failed to create IRQ handler control Tube",
1291 )?;
1292
1293 // Create a separate thread to wait on IRQ events. This is a natural division
1294 // because IRQ interrupts have no dependencies on other events, and this lets
1295 // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
1296 let irq_join_handle = IrqWaitWorker::start(
1297 irq_handler_control_for_worker,
1298 guest_os
1299 .irq_chip
1300 .try_box_clone()
1301 .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
1302 irq_control_tubes,
1303 sys_allocator_mutex.clone(),
1304 );
1305
1306 let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
1307 product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
1308 let wait_ctx = WaitContext::build_with(&triggers).exit_context(
1309 Exit::WaitContextAdd,
1310 "failed to add trigger to wait context",
1311 )?;
1312
1313 #[cfg(feature = "balloon")]
1314 let mut balloon_tube = balloon_host_tube
1315 .map(|tube| -> Result<BalloonTube> {
1316 wait_ctx
1317 .add(tube.get_read_notifier(), Token::BalloonTube)
1318 .context("failed to add trigger to wait context")?;
1319 Ok(BalloonTube::new(tube))
1320 })
1321 .transpose()
1322 .context("failed to create balloon tube")?;
1323
1324 let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
1325 let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
1326 .name("vm_memory_handler_thread".into())
1327 .spawn({
1328 let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
1329 let sys_allocator_mutex = sys_allocator_mutex.clone();
1330 move || {
1331 vm_memory_handler_thread(
1332 vm_memory_control_tubes,
1333 vm,
1334 sys_allocator_mutex,
1335 gralloc,
1336 vm_memory_handler_control_for_thread,
1337 )
1338 }
1339 })
1340 .unwrap();
1341
1342 if let Some(evt) = broker_shutdown_evt.as_ref() {
1343 wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
1344 Exit::WaitContextAdd,
1345 "failed to add trigger to wait context",
1346 )?;
1347 }
1348
1349 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1350 let mut next_control_id = control_tubes.len();
1351 for (id, control_tube) in control_tubes.iter() {
1352 #[allow(clippy::single_match)]
1353 match control_tube {
1354 TaggedControlTube::Product(product_tube) => wait_ctx
1355 .add(
1356 product_tube.get_read_notifier(),
1357 Token::VmControl { id: *id },
1358 )
1359 .exit_context(
1360 Exit::WaitContextAdd,
1361 "failed to add trigger to wait context",
1362 )?,
1363 _ => (),
1364 }
1365 }
1366
1367 let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
1368 guest_os.devices_thread = match create_devices_worker_thread(
1369 guest_os.vm.get_memory().clone(),
1370 guest_os.io_bus.clone(),
1371 guest_os.mmio_bus.clone(),
1372 device_ctrl_resp,
1373 ) {
1374 Ok(join_handle) => Some(join_handle),
1375 Err(e) => {
1376 return Err(anyhow!("Failed to start devices thread: {}", e));
1377 }
1378 };
1379
1380 let vcpus: Vec<Option<_>> = match guest_os.vcpus.take() {
1381 Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
1382 None => iter::repeat_with(|| None)
1383 .take(guest_os.vcpu_count)
1384 .collect(),
1385 };
1386
1387 let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);
1388
1389 let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;
1390
1391 let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;
1392
1393 let original_terminal_mode = stdin().set_raw_mode().ok();
1394
1395 let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
1396 let run_mode_arc = Arc::new(VcpuRunMode::default());
1397
1398 let run_mode_state = if suspended {
1399 // Sleep devices before creating vcpus.
1400 device_ctrl_tube
1401 .send(&DeviceControlCommand::SleepDevices)
1402 .context("send command to devices control socket")?;
1403 match device_ctrl_tube
1404 .recv()
1405 .context("receive from devices control socket")?
1406 {
1407 VmResponse::Ok => (),
1408 resp => bail!("device sleep failed: {}", resp),
1409 }
1410 run_mode_arc.set_and_notify(VmRunMode::Suspending);
1411 VmRunMode::Suspending
1412 } else {
1413 VmRunMode::Running
1414 };
1415
1416 // If we are restoring from a snapshot, then start suspended.
1417 if restore_path.is_some() {
1418 run_mode_arc.set_and_notify(VmRunMode::Suspending);
1419 }
1420
1421 let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
1422 vcpus,
1423 vcpu_boxes.clone(),
1424 &guest_os,
1425 &exit_evt,
1426 &vm_evt_wrtube,
1427 #[cfg(feature = "stats")]
1428 &stats,
1429 host_cpu_topology,
1430 run_mode_arc.clone(),
1431 tsc_sync_mitigations,
1432 force_calibrated_tsc_leaf,
1433 )?;
1434
1435 // Restore VM (if applicable).
1436 if let Some(path) = restore_path {
1437 vm_control::do_restore(
1438 path,
1439 &guest_os.vm,
1440 |msg| {
1441 kick_all_vcpus(
1442 run_mode_arc.as_ref(),
1443 &vcpu_control_channels,
1444 vcpu_boxes.as_ref(),
1445 guest_os.irq_chip.as_ref(),
1446 &pvclock_host_tube,
1447 msg,
1448 )
1449 },
1450 |msg, index| {
1451 kick_vcpu(
1452 run_mode_arc.as_ref(),
1453 &vcpu_control_channels,
1454 vcpu_boxes.as_ref(),
1455 guest_os.irq_chip.as_ref(),
1456 &pvclock_host_tube,
1457 index,
1458 msg,
1459 )
1460 },
1461 &irq_handler_control,
1462 &device_ctrl_tube,
1463 guest_os.vcpu_count,
1464 |image| {
1465 guest_os
1466 .irq_chip
1467 .try_box_clone()?
1468 .restore(image, guest_os.vcpu_count)
1469 },
1470 /* require_encrypted= */ false,
1471 )?;
1472 // Allow the vCPUs to start for real.
1473 kick_all_vcpus(
1474 run_mode_arc.as_ref(),
1475 &vcpu_control_channels,
1476 vcpu_boxes.as_ref(),
1477 guest_os.irq_chip.as_ref(),
1478 &pvclock_host_tube,
1479 // Other platforms (unix) have multiple modes they could start in (e.g. starting for
1480 // guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
1481 // to enter that mode here rather than VmRunMode::Running.
1482 VcpuControl::RunState(run_mode_state),
1483 );
1484 }
1485
1486 let mut exit_state = ExitState::Stop;
1487 let mut region_state = VmMemoryRegionState::new();
1488
1489 'poll: loop {
1490 let events = {
1491 match wait_ctx.wait() {
1492 Ok(v) => v,
1493 Err(e) => {
1494 error!("failed to wait: {}", e);
1495 break;
1496 }
1497 }
1498 };
1499
1500 let mut vm_control_ids_to_remove = Vec::new();
1501 for event in events.iter().filter(|e| e.is_readable) {
1502 let state = handle_readable_event(
1503 event,
1504 &mut vm_control_ids_to_remove,
1505 &mut next_control_id,
1506 &mut service_vm_state,
1507 disk_host_tubes.as_slice(),
1508 ipc_main_loop_tube.as_ref(),
1509 #[cfg(feature = "gpu")]
1510 gpu_control_tube.as_ref(),
1511 &vm_evt_rdtube,
1512 &mut control_tubes,
1513 &mut guest_os,
1514 &sys_allocator_mutex,
1515 &mut virtio_snd_host_mute_tube,
1516 proto_main_loop_tube.as_ref(),
1517 &anti_tamper_main_thread_tube,
1518 #[cfg(feature = "balloon")]
1519 balloon_tube.as_mut(),
1520 memory_size_mb,
1521 vcpu_boxes.as_ref(),
1522 &pvclock_host_tube,
1523 run_mode_arc.as_ref(),
1524 &mut region_state,
1525 vm_control_server.as_mut(),
1526 &irq_handler_control,
1527 &device_ctrl_tube,
1528 &wait_ctx,
1529 force_s2idle,
1530 &vcpu_control_channels,
1531 )?;
1532 if let Some(state) = state {
1533 exit_state = state;
1534 break 'poll;
1535 }
1536 }
1537
1538 remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1539 }
1540
1541 info!("run_control poll loop completed, forcing vCPUs to exit...");
1542
1543 // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
1544 run_mode_arc.set_and_notify(VmRunMode::Exiting);
1545
1546 // Force all vcpus to exit from the hypervisor
1547 for vcpu in vcpu_boxes.lock().iter() {
1548 vcpu.set_immediate_exit(true);
1549 }
1550
1551 let mut res = Ok(exit_state);
1552 guest_os.irq_chip.kick_halted_vcpus();
1553 let _ = exit_evt.signal();
1554
1555 if guest_os.devices_thread.is_some() {
1556 if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
1557 error!("failed to stop device control loop: {}", e);
1558 };
1559 if let Some(thread) = guest_os.devices_thread.take() {
1560 if let Err(e) = thread.join() {
1561 error!("failed to exit devices thread: {:?}", e);
1562 }
1563 }
1564 }
1565
1566 // Shut down the VM memory handler thread.
1567 if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
1568 error!(
1569 "failed to request exit from VM memory handler thread: {}",
1570 e
1571 );
1572 }
1573 if let Err(e) = vm_memory_handler_thread_join_handle.join() {
1574 error!("failed to exit VM Memory handler thread: {:?}", e);
1575 }
1576
1577 // Shut down the IRQ handler thread.
1578 if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
1579 error!("failed to request exit from IRQ handler thread: {}", e);
1580 }
1581
1582 // Ensure any child threads have ended by sending the Exit vm event (possibly again) to ensure
1583 // their run loops are aborted.
1584 let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
1585 for (i, thread) in vcpu_threads.into_iter().enumerate() {
1586 // wait till all the threads exit, so that guest_os.vm arc memory count is down to 1.
1587 // otherwise, we will hit a memory leak if we force kill the thread with terminate.
1588 match thread.join() {
1589 Ok(Err(e)) => {
1590 error!("vcpu thread {} exited with an error: {}", i, e);
1591 res = Err(e);
1592 }
1593 Ok(_) => {}
1594 Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
1595 }
1596 }
1597
1598 info!("vCPU threads have exited.");
1599
1600 if let Some(ime) = ime_thread {
1601 match ime.join() {
1602 Ok(Err(e)) => {
1603 error!("ime thread exited with an error: {}", e);
1604 if res.is_ok() {
1605 // Prioritize past errors, but return this error if it is unique, otherwise just
1606 // log it.
1607 res = Err(e)
1608 }
1609 }
1610 Ok(_) => {}
1611 Err(e) => error!("ime thread panicked: {:?}", e),
1612 }
1613 }
1614 info!("IME thread has exited.");
1615
1616 // This cancels all the outstanding and any future blocking operations.
1617 // TODO(b/196911556): Shutdown executor for cleaner shutdown. Given we are using global, for a
1618 // cleaner shutdown we have to call disarm so that all the incoming requests are run and are
1619 // cancelled. If we call shutdown all blocking threads will go away and incoming operations
1620 // won't be scheduled to run and will be dropped leading to panic. I think ideal place to call
1621 // shutdown is when we drop non-global executor.
1622 cros_async::unblock_disarm();
1623 info!("blocking async pool has shut down.");
1624
1625 let _ = irq_join_handle.join();
1626 info!("IrqWaitWorker has shut down.");
1627
1628 #[cfg(feature = "stats")]
1629 if let Some(stats) = stats {
1630 println!("Statistics Collected:\n{}", stats.lock());
1631 println!("Statistics JSON:\n{}", stats.lock().json());
1632 }
1633
1634 if let Some(mode) = original_terminal_mode {
1635 if let Err(e) = stdin().restore_mode(mode) {
1636 warn!("failed to restore terminal mode: {}", e);
1637 }
1638 }
1639
1640 // Explicitly drop the VM structure here to allow the devices to clean up before the
1641 // control tubes are closed when this function exits.
1642 mem::drop(guest_os);
1643
1644 info!("guest_os dropped, run_control is done.");
1645
1646 res
1647 }
1648
1649 /// Remove Tubes that have been closed from the WaitContext.
remove_closed_tubes<T, U>( wait_ctx: &WaitContext<T>, tubes: &mut BTreeMap<usize, U>, mut tube_ids_to_remove: Vec<usize>, ) -> anyhow::Result<()> where T: EventToken, U: ReadNotifier + CloseNotifier,1650 fn remove_closed_tubes<T, U>(
1651 wait_ctx: &WaitContext<T>,
1652 tubes: &mut BTreeMap<usize, U>,
1653 mut tube_ids_to_remove: Vec<usize>,
1654 ) -> anyhow::Result<()>
1655 where
1656 T: EventToken,
1657 U: ReadNotifier + CloseNotifier,
1658 {
1659 tube_ids_to_remove.dedup();
1660 for id in tube_ids_to_remove {
1661 if let Some(socket) = tubes.remove(&id) {
1662 wait_ctx
1663 .delete(socket.get_read_notifier())
1664 .context("failed to remove descriptor from wait context")?;
1665
1666 // There may be a close notifier registered for this Tube. If there isn't one
1667 // registered, we just ignore the error.
1668 let _ = wait_ctx.delete(socket.get_close_notifier());
1669 }
1670 }
1671 Ok(())
1672 }
1673
1674 /// Sends a message to all VCPUs.
kick_all_vcpus( run_mode: &VcpuRunMode, vcpu_control_channels: &[mpsc::Sender<VcpuControl>], vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>, irq_chip: &dyn IrqChipArch, pvclock_host_tube: &Option<Tube>, msg: VcpuControl, )1675 fn kick_all_vcpus(
1676 run_mode: &VcpuRunMode,
1677 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1678 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1679 irq_chip: &dyn IrqChipArch,
1680 pvclock_host_tube: &Option<Tube>,
1681 msg: VcpuControl,
1682 ) {
// On Windows, we handle run mode switching directly rather than delegating it to the vCPU
// thread, as Unix does.
1685 match &msg {
1686 VcpuControl::RunState(VmRunMode::Suspending) => {
1687 suspend_all_vcpus(run_mode, vcpu_boxes, irq_chip, pvclock_host_tube);
1688 return;
1689 }
1690 VcpuControl::RunState(VmRunMode::Running) => {
1691 resume_all_vcpus(run_mode, vcpu_boxes, irq_chip, pvclock_host_tube);
1692 return;
1693 }
1694 _ => (),
1695 }
1696
// For non-RunState commands, we dispatch just as Unix does.
1698 for vcpu in vcpu_control_channels {
1699 if let Err(e) = vcpu.send(msg.clone()) {
1700 error!("failed to send VcpuControl message: {}", e);
1701 }
1702 }
1703
1704 // Now that we've sent a message, we need VCPUs to exit so they can process it.
1705 for vcpu in vcpu_boxes.lock().iter() {
1706 vcpu.set_immediate_exit(true);
1707 }
1708 irq_chip.kick_halted_vcpus();
1709
1710 // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1711 // the control message.
1712 let current_run_mode = run_mode.get_mode();
1713 if current_run_mode != VmRunMode::Running {
1714 run_mode.set_and_notify(current_run_mode);
1715 }
1716 }
1717
1718 /// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a single
1719 /// VCPU.
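///
/// # Example
///
/// Illustrative only; `msg` stands for any `VcpuControl` other than `VcpuControl::RunState(_)`
/// (run-state changes must go through [`kick_all_vcpus`]):
///
/// ```ignore
/// kick_vcpu(
///     &run_mode,
///     &vcpu_control_channels,
///     &vcpu_boxes,
///     irq_chip,
///     &pvclock_host_tube,
///     /* index= */ 0,
///     msg,
/// );
/// ```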
fn kick_vcpu(
1721 run_mode: &VcpuRunMode,
1722 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1723 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1724 irq_chip: &dyn IrqChipArch,
1725 pvclock_host_tube: &Option<Tube>,
1726 index: usize,
1727 msg: VcpuControl,
1728 ) {
1729 assert!(
1730 !matches!(msg, VcpuControl::RunState(_)),
1731 "Windows does not support RunState changes on a per VCPU basis"
1732 );
1733
1734 let vcpu = vcpu_control_channels
1735 .get(index)
1736 .expect("invalid vcpu index specified");
1737 if let Err(e) = vcpu.send(msg) {
1738 error!("failed to send VcpuControl message: {}", e);
1739 }
1740
1741 // Now that we've sent a message, we need the VCPU to exit so it can
1742 // process the message.
1743 vcpu_boxes
1744 .lock()
1745 .get(index)
1746 .expect("invalid vcpu index specified")
1747 .set_immediate_exit(true);
1748 irq_chip.kick_halted_vcpus();
1749
1750 // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1751 // the control message. (Technically this wakes all VCPUs, but those without messages will go
1752 // back to sleep.)
1753 let current_run_mode = run_mode.get_mode();
1754 if current_run_mode != VmRunMode::Running {
1755 run_mode.set_and_notify(current_run_mode);
1756 }
1757 }
1758
1759 /// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
1760 /// though devices on the host will continue to run.
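///
/// # Example
///
/// A hedged sketch of a suspend/resume cycle (the handles are the same ones `kick_all_vcpus`
/// receives):
///
/// ```ignore
/// suspend_all_vcpus(&run_mode, &vcpu_boxes, irq_chip, &pvclock_host_tube);
/// // ... inspect or snapshot the frozen guest ...
/// resume_all_vcpus(&run_mode, &vcpu_boxes, irq_chip, &pvclock_host_tube);
/// ```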
pub(crate) fn suspend_all_vcpus(
1762 run_mode: &VcpuRunMode,
1763 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1764 irq_chip: &dyn IrqChipArch,
1765 pvclock_host_tube: &Option<Tube>,
1766 ) {
1767 // VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
1768 // they may re-enter the VM.
1769 run_mode.set_and_notify(VmRunMode::Suspending);
1770
1771 // Force all vcpus to exit from the hypervisor
1772 for vcpu in vcpu_boxes.lock().iter() {
1773 vcpu.set_immediate_exit(true);
1774 }
1775 irq_chip.kick_halted_vcpus();
1776
1777 handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
1778 .unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
1779 }
1780
1781 /// Resumes all VCPUs.
pub(crate) fn resume_all_vcpus(
1783 run_mode: &VcpuRunMode,
1784 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1785 irq_chip: &dyn IrqChipArch,
1786 pvclock_host_tube: &Option<Tube>,
1787 ) {
1788 handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
1789 .unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));
1790
1791 // Make sure any immediate exit bits are disabled
1792 for vcpu in vcpu_boxes.lock().iter() {
1793 vcpu.set_immediate_exit(false);
1794 }
1795
1796 run_mode.set_and_notify(VmRunMode::Running);
1797 }
1798
1799 #[cfg(feature = "gvm")]
1800 const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
1801 major: 1,
1802 minor: 4,
1803 patch: 1,
1804 };
1805
1806 #[cfg(feature = "gvm")]
fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
1808 match gvm.get_full_version() {
1809 Ok(version) => {
1810 if version < GVM_MINIMUM_VERSION {
1811 error!(
1812 "GVM version {} is below minimum version {}",
1813 version, GVM_MINIMUM_VERSION
1814 );
1815 return Err(base::Error::new(libc::ENXIO).into());
1816 } else {
1817 info!("Using GVM version {}.", version)
1818 }
1819 }
1820 Err(e) => {
1821 error!("unable to determine gvm version: {}", e);
1822 return Err(base::Error::new(libc::ENXIO).into());
1823 }
1824 }
1825 let vm = GvmVm::new(&gvm, mem)?;
1826 Ok(vm)
1827 }
1828
1829 #[cfg(feature = "haxm")]
fn create_haxm_vm(
1831 haxm: Haxm,
1832 mem: GuestMemory,
1833 kernel_log_file: &Option<String>,
1834 ) -> Result<HaxmVm> {
1835 let vm = HaxmVm::new(&haxm, mem)?;
1836 if let Some(path) = kernel_log_file {
1837 use hypervisor::haxm::HAX_CAP_VM_LOG;
1838 if vm.check_raw_capability(HAX_CAP_VM_LOG) {
1839 match vm.register_log_file(path) {
1840 Ok(_) => {}
1841 Err(e) => match e.errno() {
1842 libc::E2BIG => {
1843 error!(
1844 "kernel_log_file path is too long, kernel log file will not be written"
1845 );
1846 }
1847 _ => return Err(e.into()),
1848 },
1849 }
1850 } else {
1851 warn!(
1852 "kernel_log_file specified but this version of HAXM does not support kernel log \
1853 files"
1854 );
1855 }
1856 }
1857 Ok(vm)
1858 }
1859
1860 #[cfg(feature = "whpx")]
1861 #[cfg(target_arch = "x86_64")]
fn create_whpx_vm(
1863 whpx: Whpx,
1864 mem: GuestMemory,
1865 cpu_count: usize,
1866 no_smt: bool,
1867 apic_emulation: bool,
1868 force_calibrated_tsc_leaf: bool,
1869 vm_evt_wrtube: SendTube,
1870 ) -> Result<WhpxVm> {
1871 let cpu_config = hypervisor::CpuConfigX86_64::new(
1872 force_calibrated_tsc_leaf,
1873 false, /* host_cpu_topology */
1874 false, /* enable_hwp */
1875 no_smt,
1876 false, /* itmt */
1877 None, /* hybrid_type */
1878 );
1879
1880 // context for non-cpu-specific cpuid results
1881 let ctx = CpuIdContext::new(
1882 0,
1883 cpu_count,
1884 None,
1885 cpu_config,
1886 whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
1887 __cpuid_count,
1888 __cpuid,
1889 );
1890
1891 // Get all cpuid entries that we should pre-set
1892 let mut cpuid = whpx.get_supported_cpuid()?;
1893
1894 // Adjust them for crosvm
1895 for entry in cpuid.cpu_id_entries.iter_mut() {
1896 adjust_cpuid(entry, &ctx);
1897 }
1898
1899 let vm = WhpxVm::new(
1900 &whpx,
1901 cpu_count,
1902 mem,
1903 cpuid,
1904 apic_emulation,
1905 Some(vm_evt_wrtube),
1906 )
1907 .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
1908
1909 Ok(vm)
1910 }
1911
1912 #[cfg(feature = "gvm")]
fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
1914 info!("Creating GVM irqchip");
1915 let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
1916 Ok(irq_chip)
1917 }
1918
1919 #[cfg(feature = "whpx")]
1920 #[cfg(target_arch = "x86_64")]
fn create_whpx_split_irq_chip(
1922 vm: &WhpxVm,
1923 ioapic_device_tube: Tube,
1924 ) -> base::Result<WhpxSplitIrqChip> {
1925 info!("Creating WHPX split irqchip");
1926 WhpxSplitIrqChip::new(
1927 vm.try_clone()?,
1928 ioapic_device_tube,
1929 None, // ioapic_pins
1930 )
1931 }
1932
fn create_userspace_irq_chip<Vcpu>(
1934 vcpu_count: usize,
1935 ioapic_device_tube: Tube,
1936 ) -> base::Result<UserspaceIrqChip<Vcpu>>
1937 where
1938 Vcpu: VcpuArch + 'static,
1939 {
1940 info!("Creating userspace irqchip");
1941 let irq_chip =
1942 UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
1943 Ok(irq_chip)
1944 }
1945
pub fn get_default_hypervisor() -> Option<HypervisorKind> {
// The order here matters: hypervisors are listed from most to least preferred.
1948 #[cfg(feature = "whpx")]
1949 match hypervisor::whpx::Whpx::is_enabled() {
1950 true => return Some(HypervisorKind::Whpx),
1951 false => warn!("Whpx not enabled."),
1952 };
1953
1954 #[cfg(feature = "haxm")]
1955 if get_cpu_manufacturer() == CpuManufacturer::Intel {
1956 // Make sure Haxm device can be opened before selecting it.
1957 match Haxm::new() {
1958 Ok(_) => return Some(HypervisorKind::Ghaxm),
1959 Err(e) => warn!("Cannot initialize HAXM: {}", e),
1960 };
1961 }
1962
1963 #[cfg(feature = "gvm")]
1964 // Make sure Gvm device can be opened before selecting it.
1965 match Gvm::new() {
1966 Ok(_) => return Some(HypervisorKind::Gvm),
1967 Err(e) => warn!("Cannot initialize GVM: {}", e),
1968 };
1969
1970 None
1971 }
1972
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
1974 let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
1975 Some(
1976 File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
1977 format!("failed to open initrd {}", initrd_path.display())
1978 })?,
1979 )
1980 } else {
1981 None
1982 };
1983
1984 let vm_image = match cfg.executable_path {
1985 Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
1986 File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
1987 format!("failed to open kernel image {}", kernel_path.display(),)
1988 })?,
1989 ),
1990 Some(Executable::Bios(ref bios_path)) => {
1991 VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
1992 format!("failed to open bios {}", bios_path.display())
1993 })?)
1994 }
1995 _ => panic!("Did not receive a bios or kernel, should be impossible."),
1996 };
1997
1998 let swiotlb = if let Some(size) = cfg.swiotlb {
1999 Some(
2000 size.checked_mul(1024 * 1024)
2001 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
2002 )
2003 } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
2004 None
2005 } else {
2006 Some(64 * 1024 * 1024)
2007 };
2008
2009 let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
2010 {
2011 (
2012 Some(
2013 open_file_or_duplicate(
2014 &pflash_parameters.path,
2015 OpenOptions::new().read(true).write(true),
2016 )
2017 .with_context(|| {
2018 format!("failed to open pflash {}", pflash_parameters.path.display())
2019 })?,
2020 ),
2021 pflash_parameters.block_size,
2022 )
2023 } else {
2024 (None, 0)
2025 };
2026
2027 Ok(VmComponents {
2028 memory_size: cfg
2029 .memory
2030 .unwrap_or(256)
2031 .checked_mul(1024 * 1024)
2032 .ok_or_else(|| anyhow!("requested memory size too large"))?,
2033 swiotlb,
2034 vcpu_count: cfg.vcpu_count.unwrap_or(1),
2035 fw_cfg_enable: false,
2036 bootorder_fw_cfg_blob: Vec::new(),
2037 vcpu_affinity: cfg.vcpu_affinity.clone(),
2038 cpu_clusters: cfg.cpu_clusters.clone(),
2039 cpu_capacity: cfg.cpu_capacity.clone(),
2040 no_smt: cfg.no_smt,
2041 hugepages: cfg.hugepages,
2042 hv_cfg: hypervisor::Config {
2043 protection_type: cfg.protection_type,
2044 },
2045 vm_image,
2046 android_fstab: cfg
2047 .android_fstab
2048 .as_ref()
2049 .map(|x| {
2050 File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
2051 format!("failed to open android fstab file {}", x.display())
2052 })
2053 })
2054 .map_or(Ok(None), |v| v.map(Some))?,
2055 pstore: cfg.pstore.clone(),
2056 pflash_block_size,
2057 pflash_image,
2058 initrd_image,
2059 extra_kernel_params: cfg.params.clone(),
2060 acpi_sdts: cfg
2061 .acpi_tables
2062 .iter()
2063 .map(|path| {
2064 SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
2065 format!("failed to open ACPI file {}", path.display())
2066 })
2067 })
2068 .collect::<Result<Vec<SDT>>>()?,
2069 rt_cpus: cfg.rt_cpus.clone(),
2070 delay_rt: cfg.delay_rt,
2071 no_i8042: cfg.no_i8042,
2072 no_rtc: cfg.no_rtc,
2073 host_cpu_topology: cfg.host_cpu_topology,
2074 #[cfg(target_arch = "x86_64")]
2075 force_s2idle: cfg.force_s2idle,
2076 fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
2077 itmt: false,
2078 pvm_fw: None,
2079 #[cfg(target_arch = "x86_64")]
2080 pci_low_start: cfg.pci_low_start,
2081 #[cfg(target_arch = "x86_64")]
2082 pcie_ecam: cfg.pcie_ecam,
2083 #[cfg(target_arch = "x86_64")]
2084 smbios: cfg.smbios.clone(),
2085 dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
2086 #[cfg(target_arch = "x86_64")]
2087 break_linux_pci_config_io: cfg.break_linux_pci_config_io,
2088 boot_cpu: cfg.boot_cpu,
2089 })
2090 }
2091
2092 // Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
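// For example, the HAXM branch of `run_config_inner` below wraps the concrete chip and
// immediately reborrows it as the trait object that `run_vm` expects:
//
//     WindowsIrqChip::Userspace(irq_chip).as_mut()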
2093 enum WindowsIrqChip<V: VcpuArch> {
2094 Userspace(UserspaceIrqChip<V>),
2095 #[cfg(feature = "gvm")]
2096 Gvm(GvmIrqChip),
2097 #[cfg(feature = "whpx")]
2098 WhpxSplit(WhpxSplitIrqChip),
2099 }
2100
2101 impl<V: VcpuArch> WindowsIrqChip<V> {
2102 // Convert our enum to a &mut dyn IrqChipArch
fn as_mut(&mut self) -> &mut dyn IrqChipArch {
2104 match self {
2105 WindowsIrqChip::Userspace(i) => i,
2106 #[cfg(feature = "gvm")]
2107 WindowsIrqChip::Gvm(i) => i,
2108 #[cfg(feature = "whpx")]
2109 WindowsIrqChip::WhpxSplit(i) => i,
2110 }
2111 }
2112 }
2113
2114 /// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
2115 /// need access to it when tracing is enabled.
2116 static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
2117
2118 /// Save the TSC offset for a particular vcpu.
2119 ///
2120 /// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
2121 /// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
2122 /// it can cause clock issues in the guest.
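///
/// # Example
///
/// Illustrative values only; each vCPU thread reports its own offset once it is known:
///
/// ```ignore
/// save_vcpu_tsc_offset(0x10_0000, /* vcpu_id= */ 0);
/// save_vcpu_tsc_offset(0x10_0008, /* vcpu_id= */ 1);
/// // The standard deviation across both offsets is logged after each call.
/// ```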
pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
2124 let offsets_copy = {
2125 let mut offsets = TSC_OFFSETS.lock();
2126 // make sure offsets vec is large enough before inserting
2127 let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
2128 offsets.resize(newlen, None);
2129 offsets[vcpu_id] = Some(offset);
2130
2131 offsets.clone()
2132 };
2133
2134 // do statistics on a clone of the offsets so we don't hold up other vcpus at this point
2135 info!(
2136 "TSC offset standard deviation is: {}",
2137 standard_deviation(
2138 &offsets_copy
2139 .iter()
2140 .filter(|x| x.is_some())
2141 .map(|x| x.unwrap() as u128)
2142 .collect::<Vec<u128>>()
2143 )
2144 );
2145 }
2146
2147 /// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
2148 #[cfg(feature = "perfetto")]
pub fn get_vcpu_tsc_offset() -> u64 {
2150 if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
2151 return *offset;
2152 }
2153 0
2154 }
2155
2156 /// Callback that is registered with tracing crate, and will be called by the tracing thread when
2157 /// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for
2158 /// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the
2159 /// host TSC. Redundant snapshots should not be a problem for perfetto.
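///
/// The snapshot pairs `host_tsc.wrapping_add(offset)` (the guest view of the TSC, with `offset`
/// taken from [`get_vcpu_tsc_offset`]) with the raw `host_tsc` value, so the trace merge tool can
/// line the two clocks up.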
2160 #[cfg(feature = "perfetto")]
fn set_tsc_clock_snapshot() {
2162 let freq = match devices::tsc::tsc_frequency() {
2163 Err(e) => {
2164 error!(
2165 "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
2166 e
2167 );
2168 return;
2169 }
2170 Ok(freq) => freq,
2171 };
2172
// The offset maps the host TSC to the guest TSC (guest = host + offset).
2174 let offset = get_vcpu_tsc_offset();
// SAFETY: _rdtsc takes no arguments.
2176 let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
2177 perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
2178 // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
2179 // support floating point multipliers yet. So for now we set the freq in Hz and rely
2180 // on the merge tool to fix it.
2181 perfetto::Clock::new(
2182 perfetto::BuiltinClock::Tsc as u32,
2183 host_tsc.wrapping_add(offset),
2184 )
2185 .set_multiplier(freq as u64),
2186 perfetto::Clock::new(
2187 // The host builtin clock ids are all offset from the guest ids by
2188 // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
2189 // contains both a guest and host clock, we need to offset it before merge.
2190 perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
2191 host_tsc,
2192 )
2193 .set_multiplier(freq as u64),
2194 ));
2195 }
2196
2197 /// Launches run_config for the broker, reading configuration from a TubeTransporter.
pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
2199 let tube_transporter =
2200 // SAFETY:
// Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
2202 // the blocking & framing modes are accurate because we create them ourselves in the broker.
2203 unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
2204
2205 let mut tube_data_list = tube_transporter
2206 .read_tubes()
2207 .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
2208
2209 let bootstrap_tube = tube_data_list
2210 .get_tube(TubeToken::Bootstrap)
2211 .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
2212
2213 let mut cfg: Config = bootstrap_tube
2214 .recv::<Config>()
2215 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2216
2217 let startup_args: CommonChildStartupArgs = bootstrap_tube
2218 .recv::<CommonChildStartupArgs>()
2219 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2220 let _child_cleanup = common_child_setup(startup_args).exit_context(
2221 Exit::CommonChildSetupError,
2222 "failed to perform common child setup",
2223 )?;
2224
2225 cfg.broker_shutdown_event = Some(
2226 bootstrap_tube
2227 .recv::<Event>()
2228 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
2229 );
2230 #[cfg(feature = "crash-report")]
2231 let crash_tube_map = bootstrap_tube
2232 .recv::<HashMap<ProcessType, Vec<SendTube>>>()
2233 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2234 #[cfg(feature = "crash-report")]
2235 crash_report::set_crash_tube_map(crash_tube_map);
2236
2237 let BrokerTubes {
2238 vm_evt_wrtube,
2239 vm_evt_rdtube,
2240 } = bootstrap_tube
2241 .recv::<BrokerTubes>()
2242 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2243
2244 run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2245 }
2246
pub fn run_config(cfg: Config) -> Result<ExitState> {
2248 let _raise_timer_resolution = enable_high_res_timers()
2249 .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
2250
2251 // There is no broker when using run_config(), so the vm_evt tubes need to be created.
2252 let (vm_evt_wrtube, vm_evt_rdtube) =
2253 Tube::directional_pair().context("failed to create vm event tube")?;
2254
2255 run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2256 }
2257
fn create_guest_memory(
2259 components: &VmComponents,
2260 hypervisor: &impl Hypervisor,
2261 ) -> Result<GuestMemory> {
2262 let guest_mem_layout = Arch::guest_memory_layout(components, hypervisor).exit_context(
2263 Exit::GuestMemoryLayout,
2264 "failed to create guest memory layout",
2265 )?;
2266 GuestMemory::new_with_options(&guest_mem_layout)
2267 .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
2268 }
2269
fn run_config_inner(
2271 cfg: Config,
2272 vm_evt_wrtube: SendTube,
2273 vm_evt_rdtube: RecvTube,
2274 ) -> Result<ExitState> {
2275 product::setup_common_metric_invariants(&cfg);
2276
2277 #[cfg(feature = "perfetto")]
2278 cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);
2279
2280 let components: VmComponents = setup_vm_components(&cfg)?;
2281
2282 #[allow(unused_mut)]
2283 let mut hypervisor = cfg
2284 .hypervisor
2285 .or_else(get_default_hypervisor)
2286 .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
2287
2288 #[cfg(feature = "whpx")]
2289 if hypervisor::whpx::Whpx::is_enabled() {
2290 // If WHPX is enabled, no other hypervisor can be used, so just override it
2291 hypervisor = HypervisorKind::Whpx;
2292 }
2293
2294 match hypervisor {
2295 #[cfg(feature = "haxm")]
2296 HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
2297 if hypervisor == HypervisorKind::Haxm {
2298 set_use_ghaxm(false);
2299 }
2300 info!("Creating HAXM ghaxm={}", get_use_ghaxm());
2301 let haxm = Haxm::new()?;
2302 let guest_mem = create_guest_memory(&components, &haxm)?;
2303 let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
2304 let (ioapic_host_tube, ioapic_device_tube) =
2305 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2306 let irq_chip =
2307 create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
2308 run_vm::<HaxmVcpu, HaxmVm>(
2309 cfg,
2310 components,
2311 vm,
2312 WindowsIrqChip::Userspace(irq_chip).as_mut(),
2313 Some(ioapic_host_tube),
2314 vm_evt_wrtube,
2315 vm_evt_rdtube,
2316 )
2317 }
2318 #[cfg(feature = "whpx")]
2319 HypervisorKind::Whpx => {
2320 let apic_emulation_supported =
2321 Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
2322 .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;
2323
2324 let no_smt = cfg.no_smt;
2325
2326 // Default to WhpxSplitIrqChip if it's supported because it's more performant
2327 let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
2328 IrqChipKind::Split
2329 } else {
2330 IrqChipKind::Userspace
2331 });
2332
2333 // Both WHPX irq chips use a userspace IOAPIC
2334 let (ioapic_host_tube, ioapic_device_tube) =
2335 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2336
2337 info!("Creating Whpx");
2338 let whpx = Whpx::new()?;
2339 let guest_mem = create_guest_memory(&components, &whpx)?;
2340 let vm = create_whpx_vm(
2341 whpx,
2342 guest_mem,
2343 components.vcpu_count,
2344 no_smt,
2345 apic_emulation_supported && irq_chip == IrqChipKind::Split,
2346 cfg.force_calibrated_tsc_leaf,
2347 vm_evt_wrtube
2348 .try_clone()
2349 .expect("could not clone vm_evt_wrtube"),
2350 )?;
2351
2352 let mut irq_chip = match irq_chip {
2353 IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
2354 IrqChipKind::Split => {
2355 if !apic_emulation_supported {
2356 panic!(
2357 "split irqchip specified but your WHPX version does not support \
2358 local apic emulation"
2359 );
2360 }
2361 WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
2362 }
2363 IrqChipKind::Userspace => {
2364 WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
2365 components.vcpu_count,
2366 ioapic_device_tube,
2367 )?)
2368 }
2369 };
2370 run_vm::<WhpxVcpu, WhpxVm>(
2371 cfg,
2372 components,
2373 vm,
2374 irq_chip.as_mut(),
2375 Some(ioapic_host_tube),
2376 vm_evt_wrtube,
2377 vm_evt_rdtube,
2378 )
2379 }
2380 #[cfg(feature = "gvm")]
2381 HypervisorKind::Gvm => {
2382 info!("Creating GVM");
2383 let gvm = Gvm::new()?;
2384 let guest_mem = create_guest_memory(&components, &gvm)?;
2385 let vm = create_gvm_vm(gvm, guest_mem)?;
2386 let ioapic_host_tube;
2387 let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
2388 IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
2389 IrqChipKind::Kernel => {
2390 ioapic_host_tube = None;
2391 WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
2392 }
2393 IrqChipKind::Userspace => {
2394 let (host_tube, ioapic_device_tube) =
2395 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2396 ioapic_host_tube = Some(host_tube);
2397 WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
2398 components.vcpu_count,
2399 ioapic_device_tube,
2400 )?)
2401 }
2402 };
2403 run_vm::<GvmVcpu, GvmVm>(
2404 cfg,
2405 components,
2406 vm,
2407 irq_chip.as_mut(),
2408 ioapic_host_tube,
2409 vm_evt_wrtube,
2410 vm_evt_rdtube,
2411 )
2412 }
2413 }
2414 }
2415
2416 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
fn run_vm<Vcpu, V>(
2418 #[allow(unused_mut)] mut cfg: Config,
2419 #[allow(unused_mut)] mut components: VmComponents,
2420 mut vm: V,
2421 irq_chip: &mut dyn IrqChipArch,
2422 ioapic_host_tube: Option<Tube>,
2423 vm_evt_wrtube: SendTube,
2424 vm_evt_rdtube: RecvTube,
2425 ) -> Result<ExitState>
2426 where
2427 Vcpu: VcpuArch + 'static,
2428 V: VmArch + 'static,
2429 {
2430 let vm_memory_size_mb = components.memory_size / (1024 * 1024);
2431 let mut control_tubes = Vec::new();
2432 let mut irq_control_tubes = Vec::new();
2433 let mut vm_memory_control_tubes = Vec::new();
2434 // Create one control tube per disk.
2435 let mut disk_device_tubes = Vec::new();
2436 let mut disk_host_tubes = Vec::new();
2437 let disk_count = cfg.disks.len();
2438 for _ in 0..disk_count {
2439 let (disk_host_tube, disk_device_tube) =
2440 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2441 disk_host_tubes.push(disk_host_tube);
2442 disk_device_tubes.push(disk_device_tube);
2443 }
2444
2445 if let Some(ioapic_host_tube) = ioapic_host_tube {
2446 irq_control_tubes.push(ioapic_host_tube);
2447 }
2448
2449 // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2450 let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
2451 let (balloon_host_tube, balloon_device_tube) =
2452 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2453 (Some(balloon_host_tube), Some(balloon_device_tube))
2454 } else {
2455 (None, None)
2456 };
2457 // The balloon device also needs a tube to communicate back to the main process to
2458 // handle remapping memory dynamically.
2459 let dynamic_mapping_device_tube = if cfg.balloon {
2460 let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
2461 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2462 vm_memory_control_tubes.push(dynamic_mapping_host_tube);
2463 Some(dynamic_mapping_device_tube)
2464 } else {
2465 None
2466 };
2467
2468 // PvClock gets a tube for handling suspend/resume requests from the main thread.
2469 let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
2470 let (host, device) =
2471 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2472 (Some(host), Some(device))
2473 } else {
2474 (None, None)
2475 };
2476
2477 let gralloc = RutabagaGralloc::new(RutabagaGrallocBackendFlags::new())
2478 .exit_context(Exit::CreateGralloc, "failed to create gralloc")?;
2479
2480 let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2481 let mut sys_allocator = SystemAllocator::new(
2482 Arch::get_system_allocator_config(&vm),
2483 pstore_size,
2484 &cfg.mmio_address_ranges,
2485 )
2486 .context("failed to create system allocator")?;
2487
2488 // Allocate the ramoops region first.
2489 let ramoops_region = match &components.pstore {
2490 Some(pstore) => Some(
2491 arch::pstore::create_memory_region(
2492 &mut vm,
2493 sys_allocator.reserved_region().unwrap(),
2494 pstore,
2495 )
2496 .exit_context(
2497 Exit::Pstore,
2498 format!("failed to allocate pstore region {:?}", &components.pstore),
2499 )?,
2500 ),
2501 None => None,
2502 };
2503
2504 let init_balloon_size = components
2505 .memory_size
2506 .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
2507 m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
2508 }))
2509 .context("failed to calculate init balloon size")?;
2510
2511 let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
2512 let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);
2513
2514 if tsc_state.core_grouping.size() > 1 {
2515 // Host TSCs are not in sync, log a metric about it.
2516 warn!(
2517 "Host TSCs are not in sync, applying the following mitigations: {:?}",
2518 tsc_sync_mitigations
2519 );
2520 log_descriptor(
2521 MetricEventType::TscCoresOutOfSync,
2522 // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
2523 tsc_state.core_grouping.core_grouping_bitmask() as i64,
2524 );
2525 }
2526
2527 #[cfg(feature = "gpu")]
2528 let gpu_control_tube = cfg
2529 .gpu_vmm_config
2530 .as_mut()
2531 .and_then(|config| config.gpu_control_host_tube.take());
2532 let product_args = product::get_run_control_args(&mut cfg);
2533
2534 // We open these files before lowering the token, as in the future a stricter policy may
2535 // prevent it.
2536 let dt_overlays = cfg
2537 .device_tree_overlay
2538 .iter()
2539 .map(|o| {
2540 Ok(DtbOverlay {
2541 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2542 .with_context(|| {
2543 format!("failed to open device tree overlay {}", o.path.display())
2544 })?,
2545 })
2546 })
2547 .collect::<Result<Vec<DtbOverlay>>>()?;
2548
2549 // Lower the token, locking the main process down to a stricter security policy.
2550 //
2551 // WARNING:
2552 //
2553 // Windows system calls can behave in unusual ways if they happen concurrently to the token
2554 // lowering. For example, access denied can happen if Tube pairs are created in another thread
2555 // (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
2556 // not privileged resources, but can be broken due to the token changing unexpectedly.
2557 //
2558 // We explicitly lower the token here and *then* call run_control to make it clear that any
2559 // resources that require a privileged token should be created on the main thread & passed into
2560 // run_control, to follow the correct order:
2561 // - Privileged resources are created.
2562 // - Token is lowered.
2563 // - Threads are spawned & may create more non-privileged resources (without fear of the token
2564 // changing at an undefined time).
2565 //
// Recommendation: if your code doesn't work in run_control because of the sandbox, move any
// resource creation to before this token lowering and pass the resources into run_control.
// Don't move the token lowering somewhere else without considering multi-threaded
2569 // effects.
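//
// A hedged sketch of that ordering (`open_privileged_resource` is purely illustrative, not a
// real helper):
//
//     let privileged_handle = open_privileged_resource()?; // created while still privileged
//     /* lower_token(), as done just below */
//     run_control(/* ..., privileged_handle, ... */)        // consumed under the lowered token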
2570 #[cfg(feature = "sandbox")]
2571 if sandbox::is_sandbox_target() {
2572 sandbox::TargetServices::get()
2573 .exit_code_from_err("failed to create sandbox")?
2574 .expect("Could not create sandbox!")
2575 .lower_token();
2576 }
2577
2578 let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;
2579
2580 let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;
2581
2582 let pci_devices = create_devices(
2583 &mut cfg,
2584 vm.get_memory(),
2585 &vm_evt_wrtube,
2586 &mut irq_control_tubes,
2587 &mut vm_memory_control_tubes,
2588 &mut control_tubes,
2589 &mut disk_device_tubes,
2590 balloon_device_tube,
2591 pvclock_device_tube,
2592 dynamic_mapping_device_tube,
2593 /* inflate_tube= */ None,
2594 init_balloon_size,
2595 tsc_state.frequency,
2596 virtio_snd_state_device_tube,
2597 virtio_snd_device_mute_tube,
2598 )?;
2599
2600 let mut vcpu_ids = Vec::new();
2601
2602 let windows = Arch::build_vm::<V, Vcpu>(
2603 components,
2604 &vm_evt_wrtube,
2605 &mut sys_allocator,
2606 &cfg.serial_parameters,
2607 None,
2608 (cfg.battery_config.as_ref().map(|t| t.type_), None),
2609 vm,
2610 ramoops_region,
2611 pci_devices,
2612 irq_chip,
2613 &mut vcpu_ids,
2614 cfg.dump_device_tree_blob.clone(),
2615 /* debugcon_jail= */ None,
2616 None,
2617 None,
2618 dt_overlays,
2619 )
2620 .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;
2621
2622 #[cfg(feature = "stats")]
2623 let stats = if cfg.exit_stats {
2624 Some(Arc::new(Mutex::new(StatisticsCollector::new())))
2625 } else {
2626 None
2627 };
2628
2629 run_control(
2630 windows,
2631 sys_allocator,
2632 control_tubes,
2633 irq_control_tubes,
2634 vm_memory_control_tubes,
2635 vm_evt_rdtube,
2636 vm_evt_wrtube,
2637 #[cfg(feature = "gpu")]
2638 gpu_control_tube,
2639 cfg.broker_shutdown_event.take(),
2640 balloon_host_tube,
2641 pvclock_host_tube,
2642 disk_host_tubes,
2643 gralloc,
2644 #[cfg(feature = "stats")]
2645 stats,
2646 cfg.service_pipe_name,
2647 vm_memory_size_mb,
2648 cfg.host_cpu_topology,
2649 tsc_sync_mitigations,
2650 cfg.force_calibrated_tsc_leaf,
2651 product_args,
2652 virtio_snd_host_mute_tube,
2653 cfg.restore_path,
2654 cfg.socket_path,
2655 cfg.force_s2idle,
2656 cfg.suspended,
2657 )
2658 }
2659
2660 #[cfg(test)]
2661 mod tests {
2662 use tempfile::TempDir;
2663
2664 use super::*;
2665
fn create_config(test_dir: &TempDir) -> Config {
2667 let mut config = Config::default();
2668
2669 let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
2670 OpenOptions::new()
2671 .create(true)
2672 .write(true)
2673 .open(&dummy_kernel_path)
2674 .expect("Could not open file!");
2675 config.executable_path = Some(Executable::Kernel(dummy_kernel_path));
2676
2677 config
2678 }
2679
2680 #[test]
2681 #[should_panic(expected = "Did not receive a bios or kernel")]
fn setup_vm_components_panics_when_no_kernel_provided() {
2683 let mut config =
2684 create_config(&TempDir::new().expect("Could not create temporary directory!"));
2685 config.executable_path = None;
2686 let _ = setup_vm_components(&config);
2687 }
2688
2689 #[test]
fn setup_vm_components_stores_memory_in_bytes() {
2691 let tempdir = TempDir::new().expect("Could not create temporary directory!");
2692 let mut config = create_config(&tempdir);
2693 config.memory = Some(1);
2694 let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
2695 assert_eq!(vm_components.memory_size, 1024 * 1024);
2696 }
2697
2698 #[test]
fn setup_vm_components_fails_when_memory_too_large() {
2700 let tempdir = TempDir::new().expect("Could not create temporary directory!");
2701 let mut config = create_config(&tempdir);
// One MiB more than a u64 can hold in bytes
2703 config.memory = Some((u64::MAX / 1024 / 1024) + 1);
2704 setup_vm_components(&config).err().expect("expected error");
2705 }
2706 }
2707