1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // TODO(b:240716507): There is a huge chunk of code which depends on haxm, whpx, or gvm being
6 // enabled but isn't marked as such. Remove this allow once that code is properly gated.
7 #![allow(dead_code, unused_imports, unused_variables, unreachable_code)]
8
9 pub(crate) mod control_server;
10 pub(crate) mod irq_wait;
11 pub(crate) mod main;
12 #[cfg(not(feature = "crash-report"))]
13 mod panic_hook;
14
15 mod generic;
16 use generic as product;
17 pub(crate) mod run_vcpu;
18
19 #[cfg(feature = "whpx")]
20 use std::arch::x86_64::__cpuid;
21 #[cfg(feature = "whpx")]
22 use std::arch::x86_64::__cpuid_count;
23 use std::cmp::Reverse;
24 use std::collections::BTreeMap;
25 use std::collections::HashMap;
26 use std::fs::File;
27 use std::fs::OpenOptions;
28 use std::io::stdin;
29 use std::iter;
30 use std::mem;
31 use std::os::windows::fs::OpenOptionsExt;
32 use std::path::PathBuf;
33 use std::sync::mpsc;
34 use std::sync::Arc;
35
36 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
37 use aarch64::AArch64 as Arch;
38 use acpi_tables::sdt::SDT;
39 use anyhow::anyhow;
40 use anyhow::bail;
41 use anyhow::Context;
42 use anyhow::Result;
43 use arch::CpuConfigArch;
44 use arch::DtbOverlay;
45 use arch::IrqChipArch;
46 use arch::LinuxArch;
47 use arch::RunnableLinuxVm;
48 use arch::VcpuArch;
49 use arch::VirtioDeviceStub;
50 use arch::VmArch;
51 use arch::VmComponents;
52 use arch::VmImage;
53 use base::enable_high_res_timers;
54 use base::error;
55 use base::info;
56 use base::open_file_or_duplicate;
57 use base::warn;
58 use base::AsRawDescriptor;
59 #[cfg(feature = "gpu")]
60 use base::BlockingMode;
61 use base::CloseNotifier;
62 use base::Event;
63 use base::EventToken;
64 use base::EventType;
65 use base::FlushOnDropTube;
66 #[cfg(feature = "gpu")]
67 use base::FramingMode;
68 use base::FromRawDescriptor;
69 use base::ProtoTube;
70 use base::RawDescriptor;
71 use base::ReadNotifier;
72 use base::RecvTube;
73 use base::SendTube;
74 #[cfg(feature = "gpu")]
75 use base::StreamChannel;
76 use base::Terminal;
77 use base::TriggeredEvent;
78 use base::Tube;
79 use base::TubeError;
80 use base::VmEventType;
81 use base::WaitContext;
82 use broker_ipc::common_child_setup;
83 use broker_ipc::CommonChildStartupArgs;
84 use control_server::ControlServer;
85 use crosvm_cli::sys::windows::exit::Exit;
86 use crosvm_cli::sys::windows::exit::ExitContext;
87 use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
88 use crosvm_cli::sys::windows::exit::ExitContextOption;
89 use devices::create_devices_worker_thread;
90 use devices::serial_device::SerialHardware;
91 use devices::serial_device::SerialParameters;
92 use devices::tsc::get_tsc_sync_mitigations;
93 use devices::tsc::standard_deviation;
94 use devices::tsc::TscSyncMitigations;
95 use devices::virtio;
96 use devices::virtio::block::DiskOption;
97 #[cfg(feature = "audio")]
98 use devices::virtio::snd::common_backend::VirtioSnd;
99 #[cfg(feature = "audio")]
100 use devices::virtio::snd::parameters::Parameters as SndParameters;
101 #[cfg(feature = "gpu")]
102 use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
103 #[cfg(feature = "gpu")]
104 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
105 #[cfg(feature = "gpu")]
106 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
107 #[cfg(feature = "gpu")]
108 use devices::virtio::vhost::user::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
109 #[cfg(feature = "gpu")]
110 use devices::virtio::vhost::user::gpu::sys::windows::run_gpu_device_worker;
111 #[cfg(feature = "audio")]
112 use devices::virtio::vhost::user::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
113 #[cfg(feature = "audio")]
114 use devices::virtio::vhost::user::snd::sys::windows::run_snd_device_worker;
115 #[cfg(feature = "audio")]
116 use devices::virtio::vhost::user::snd::sys::windows::SndSplitConfig;
117 #[cfg(feature = "balloon")]
118 use devices::virtio::BalloonFeatures;
119 use devices::virtio::Console;
120 #[cfg(feature = "gpu")]
121 use devices::virtio::GpuParameters;
122 use devices::BusDeviceObj;
123 use devices::BusResumeDevice;
124 #[cfg(feature = "gvm")]
125 use devices::GvmIrqChip;
126 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
127 use devices::IrqChip;
128 use devices::UserspaceIrqChip;
129 use devices::VcpuRunState;
130 use devices::VirtioPciDevice;
131 #[cfg(feature = "whpx")]
132 use devices::WhpxSplitIrqChip;
133 #[cfg(feature = "gpu")]
134 use gpu_display::EventDevice;
135 #[cfg(feature = "gpu")]
136 use gpu_display::WindowProcedureThread;
137 #[cfg(feature = "gpu")]
138 use gpu_display::WindowProcedureThreadBuilder;
139 #[cfg(feature = "gvm")]
140 use hypervisor::gvm::Gvm;
141 #[cfg(feature = "gvm")]
142 use hypervisor::gvm::GvmVcpu;
143 #[cfg(feature = "gvm")]
144 use hypervisor::gvm::GvmVersion;
145 #[cfg(feature = "gvm")]
146 use hypervisor::gvm::GvmVm;
147 #[cfg(feature = "haxm")]
148 use hypervisor::haxm::get_use_ghaxm;
149 #[cfg(feature = "haxm")]
150 use hypervisor::haxm::set_use_ghaxm;
151 #[cfg(feature = "haxm")]
152 use hypervisor::haxm::Haxm;
153 #[cfg(feature = "haxm")]
154 use hypervisor::haxm::HaxmVcpu;
155 #[cfg(feature = "haxm")]
156 use hypervisor::haxm::HaxmVm;
157 #[cfg(feature = "whpx")]
158 use hypervisor::whpx::Whpx;
159 #[cfg(feature = "whpx")]
160 use hypervisor::whpx::WhpxFeature;
161 #[cfg(feature = "whpx")]
162 use hypervisor::whpx::WhpxVcpu;
163 #[cfg(feature = "whpx")]
164 use hypervisor::whpx::WhpxVm;
165 use hypervisor::Hypervisor;
166 #[cfg(feature = "whpx")]
167 use hypervisor::HypervisorCap;
168 #[cfg(feature = "whpx")]
169 use hypervisor::HypervisorX86_64;
170 use hypervisor::ProtectionType;
171 use hypervisor::Vm;
172 use irq_wait::IrqWaitWorker;
173 use jail::FakeMinijailStub as Minijail;
174 #[cfg(not(feature = "crash-report"))]
175 pub(crate) use panic_hook::set_panic_hook;
176 use product::create_snd_mute_tube_pair;
177 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
178 use product::create_snd_state_tube;
179 #[cfg(feature = "pvclock")]
180 use product::handle_pvclock_request;
181 use product::merge_session_invariants;
182 use product::run_ime_thread;
183 use product::set_package_name;
184 pub(crate) use product::setup_metrics_reporting;
185 use product::start_service_ipc_listener;
186 use product::RunControlArgs;
187 use product::ServiceVmState;
188 use product::Token;
189 use resources::SystemAllocator;
190 use run_vcpu::run_all_vcpus;
191 use run_vcpu::VcpuRunMode;
192 use rutabaga_gfx::RutabagaGralloc;
193 use rutabaga_gfx::RutabagaGrallocBackendFlags;
194 use smallvec::SmallVec;
195 use sync::Mutex;
196 use tube_transporter::TubeToken;
197 use tube_transporter::TubeTransporterReader;
198 use vm_control::api::VmMemoryClient;
199 #[cfg(feature = "balloon")]
200 use vm_control::BalloonControlCommand;
201 #[cfg(feature = "balloon")]
202 use vm_control::BalloonTube;
203 use vm_control::DeviceControlCommand;
204 use vm_control::InitialAudioSessionState;
205 use vm_control::IrqHandlerRequest;
206 use vm_control::PvClockCommand;
207 use vm_control::VcpuControl;
208 use vm_control::VmMemoryRegionState;
209 use vm_control::VmMemoryRequest;
210 use vm_control::VmRequest;
211 use vm_control::VmResponse;
212 use vm_control::VmRunMode;
213 use vm_memory::GuestAddress;
214 use vm_memory::GuestMemory;
215 use vmm_vhost::Connection;
216 use vmm_vhost::FrontendReq;
217 use win_util::ProcessType;
218 #[cfg(feature = "whpx")]
219 use x86_64::cpuid::adjust_cpuid;
220 #[cfg(feature = "whpx")]
221 use x86_64::cpuid::CpuIdContext;
222 #[cfg(target_arch = "x86_64")]
223 use x86_64::X8664arch as Arch;
224
225 use crate::crosvm::config::Config;
226 use crate::crosvm::config::Executable;
227 use crate::crosvm::config::InputDeviceOption;
228 #[cfg(any(feature = "gvm", feature = "whpx"))]
229 use crate::crosvm::config::IrqChipKind;
230 #[cfg(feature = "gpu")]
231 use crate::crosvm::config::TouchDeviceOption;
232 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
233 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
234 use crate::crosvm::sys::config::HypervisorKind;
235 use crate::crosvm::sys::windows::broker::BrokerTubes;
236 #[cfg(feature = "stats")]
237 use crate::crosvm::sys::windows::stats::StatisticsCollector;
238 #[cfg(feature = "gpu")]
239 pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
240 #[cfg(feature = "audio")]
241 pub(crate) use crate::sys::windows::product::get_snd_product_configs;
242 #[cfg(feature = "gpu")]
243 pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
244 use crate::sys::windows::product::log_descriptor;
245 #[cfg(feature = "audio")]
246 pub(crate) use crate::sys::windows::product::num_input_sound_devices;
247 #[cfg(feature = "audio")]
248 pub(crate) use crate::sys::windows::product::num_input_sound_streams;
249 use crate::sys::windows::product::spawn_anti_tamper_thread;
250 use crate::sys::windows::product::MetricEventType;
251
252 const DEFAULT_GUEST_CID: u64 = 3;
253
254 // By default, if enabled, the balloon working set (WS) feature will use 4 bins.
255 const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;
256
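/// Control tubes monitored by the main loop, tagged by the kind of requests they carry:
/// generic VM control requests, or product-specific requests.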
257 enum TaggedControlTube {
258 Vm(FlushOnDropTube),
259 Product(product::TaggedControlTube),
260 }
261
262 impl ReadNotifier for TaggedControlTube {
263     fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
264 match self {
265 Self::Vm(tube) => tube.0.get_read_notifier(),
266 Self::Product(tube) => tube.get_read_notifier(),
267 }
268 }
269 }
270
271 impl CloseNotifier for TaggedControlTube {
272     fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
273 match self {
274 Self::Vm(tube) => tube.0.get_close_notifier(),
275 Self::Product(tube) => tube.get_close_notifier(),
276 }
277 }
278 }
279
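/// The final state of the VM reported when the main control loop exits.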
280 pub enum ExitState {
281 Reset,
282 Stop,
283 Crash,
284 #[allow(dead_code)]
285 GuestPanic,
286 WatchdogReset,
287 }
288
289 type DeviceResult<T = VirtioDeviceStub> = Result<T>;
290
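/// Creates a vhost-user block device frontend that talks to its backend over `connection`.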
291 fn create_vhost_user_block_device(
292 cfg: &Config,
293 connection: Connection<FrontendReq>,
294 ) -> DeviceResult {
295 let dev = virtio::VhostUserFrontend::new(
296 virtio::DeviceType::Block,
297 virtio::base_features(cfg.protection_type),
298 connection,
299 None,
300 None,
301 )
302 .exit_context(
303 Exit::VhostUserBlockDeviceNew,
304 "failed to set up vhost-user block device",
305 )?;
306
307 Ok(VirtioDeviceStub {
308 dev: Box::new(dev),
309 jail: None,
310 })
311 }
312
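/// Creates an in-process virtio block device backed by the given `DiskOption`.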
313 fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
314 let features = virtio::base_features(cfg.protection_type);
315 let dev = virtio::BlockAsync::new(
316 features,
317 disk.open()?,
318 disk,
319 Some(disk_device_tube),
320 None,
321 None,
322 )
323 .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;
324
325 Ok(VirtioDeviceStub {
326 dev: Box::new(dev),
327 jail: None,
328 })
329 }
330
331 #[cfg(feature = "gpu")]
332 fn create_vhost_user_gpu_device(
333 base_features: u64,
334 connection: Connection<FrontendReq>,
335 ) -> DeviceResult {
336 let dev = virtio::VhostUserFrontend::new(
337 virtio::DeviceType::Gpu,
338 base_features,
339 connection,
340 None,
341 None,
342 )
343 .exit_context(
344 Exit::VhostUserGpuDeviceNew,
345 "failed to set up vhost-user gpu device",
346 )?;
347
348 Ok(VirtioDeviceStub {
349 dev: Box::new(dev),
350 jail: None,
351 })
352 }
353
354 #[cfg(feature = "audio")]
355 fn create_vhost_user_snd_device(
356 base_features: u64,
357 connection: Connection<FrontendReq>,
358 ) -> DeviceResult {
359 let dev = virtio::VhostUserFrontend::new(
360 virtio::DeviceType::Sound,
361 base_features,
362 connection,
363 None,
364 None,
365 )
366 .exit_context(
367 Exit::VhostUserSndDeviceNew,
368 "failed to set up vhost-user snd device",
369 )?;
370
371 Ok(VirtioDeviceStub {
372 dev: Box::new(dev),
373 jail: None,
374 })
375 }
376
377 #[cfg(feature = "gpu")]
378 fn create_multi_touch_device(
379 cfg: &Config,
380 event_pipe: StreamChannel,
381 width: u32,
382 height: u32,
383 name: Option<&str>,
384 idx: u32,
385 ) -> DeviceResult {
386 let dev = virtio::input::new_multi_touch(
387 idx,
388 event_pipe,
389 width,
390 height,
391 name,
392 virtio::base_features(cfg.protection_type),
393 )
394 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
395 Ok(VirtioDeviceStub {
396 dev: Box::new(dev),
397 jail: None,
398 })
399 }
400
401 #[cfg(feature = "gpu")]
402 fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
403 let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
404 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
405 Ok(VirtioDeviceStub {
406 dev: Box::new(dev),
407 jail: None,
408 })
409 }
410
411 #[cfg(feature = "slirp")]
412 fn create_vhost_user_net_device(cfg: &Config, connection: Connection<FrontendReq>) -> DeviceResult {
413 let features = virtio::base_features(cfg.protection_type);
414 let dev =
415 virtio::VhostUserFrontend::new(virtio::DeviceType::Net, features, connection, None, None)
416 .exit_context(
417 Exit::VhostUserNetDeviceNew,
418 "failed to set up vhost-user net device",
419 )?;
420
421 Ok(VirtioDeviceStub {
422 dev: Box::new(dev),
423 jail: None,
424 })
425 }
426
427 fn create_rng_device(cfg: &Config) -> DeviceResult {
428 let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
429 .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;
430
431 Ok(VirtioDeviceStub {
432 dev: Box::new(dev),
433 jail: None,
434 })
435 }
436
437 fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
438 let mut keep_rds = Vec::new();
439 let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
440 let dev = param
441 .create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
442 .exit_context(Exit::CreateConsole, "failed to create console device")?;
443
444 Ok(VirtioDeviceStub {
445 dev: Box::new(dev),
446 jail: None,
447 })
448 }
449
450 #[cfg(feature = "balloon")]
451 fn create_balloon_device(
452 cfg: &Config,
453 balloon_device_tube: Tube,
454 dynamic_mapping_device_tube: Tube,
455 inflate_tube: Option<Tube>,
456 init_balloon_size: u64,
457 ) -> DeviceResult {
458 let balloon_features =
459 (cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
460 let dev = virtio::Balloon::new(
461 virtio::base_features(cfg.protection_type),
462 balloon_device_tube,
463 VmMemoryClient::new(dynamic_mapping_device_tube),
464 inflate_tube,
465 init_balloon_size,
466 balloon_features,
467 #[cfg(feature = "registered_events")]
468 None,
469 VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
470 )
471 .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;
472
473 Ok(VirtioDeviceStub {
474 dev: Box::new(dev),
475 jail: None,
476 })
477 }
478
479 fn create_vsock_device(cfg: &Config) -> DeviceResult {
480 // We only support a single guest, so we can confidently assign a default
481 // CID if one isn't provided. We choose the lowest non-reserved value.
482 let dev = virtio::vsock::Vsock::new(
483 cfg.vsock
484 .as_ref()
485 .map(|cfg| cfg.cid)
486 .unwrap_or(DEFAULT_GUEST_CID),
487 cfg.host_guid.clone(),
488 virtio::base_features(cfg.protection_type),
489 )
490 .exit_context(
491 Exit::UserspaceVsockDeviceNew,
492 "failed to create userspace vsock device",
493 )?;
494
495 Ok(VirtioDeviceStub {
496 dev: Box::new(dev),
497 jail: None,
498 })
499 }
500
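/// Creates the virtio devices requested by `cfg` (block, console, snd, pvclock, rng, net,
/// balloon, vsock, input and gpu, as enabled), wiring each one to the tubes it needs.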
501 fn create_virtio_devices(
502 cfg: &mut Config,
503 vm_evt_wrtube: &SendTube,
504 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
505 disk_device_tubes: &mut Vec<Tube>,
506 initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
507 balloon_device_tube: Option<Tube>,
508 #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
509 dynamic_mapping_device_tube: Option<Tube>,
510 inflate_tube: Option<Tube>,
511 init_balloon_size: u64,
512 tsc_frequency: u64,
513 virtio_snd_state_device_tube: Option<Tube>,
514 virtio_snd_control_device_tube: Option<Tube>,
515 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
516 let mut devs = Vec::new();
517
518 if cfg.block_vhost_user_tube.is_empty() {
519 // Disk devices must precede virtio-console devices or the kernel does not boot.
520 // TODO(b/171215421): figure out why this ordering is required and fix it.
521 for disk in &cfg.disks {
522 let disk_device_tube = disk_device_tubes.remove(0);
523 devs.push(create_block_device(cfg, disk, disk_device_tube)?);
524 }
525 } else {
526 info!("Starting up vhost user block backends...");
527 for _disk in &cfg.disks {
528 let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
529 let connection = Connection::<FrontendReq>::from(disk_device_tube);
530 devs.push(create_vhost_user_block_device(cfg, connection)?);
531 }
532 }
533
534 for (_, param) in cfg
535 .serial_parameters
536 .iter()
537 .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
538 {
539 let dev = create_console_device(cfg, param)?;
540 devs.push(dev);
541 }
542
543 #[cfg(feature = "audio")]
544 {
545 let snd_split_configs = std::mem::take(&mut cfg.snd_split_configs);
546 for mut snd_split_cfg in snd_split_configs.into_iter() {
547 devs.push(create_virtio_snd_device(
548 cfg,
549 &mut snd_split_cfg,
550 control_tubes,
551 )?);
552 if let Some(vmm_config) = snd_split_cfg.vmm_config {
553 let initial_audio_session_state = InitialAudioSessionState {
554 audio_client_guid: vmm_config.audio_client_guid,
555 card_index: vmm_config.card_index,
556 };
557 initial_audio_session_states.push(initial_audio_session_state);
558 }
559 }
560 }
561
562 #[cfg(feature = "pvclock")]
563 if let Some(tube) = pvclock_device_tube {
564 product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
565 }
566
567 devs.push(create_rng_device(cfg)?);
568
569 #[cfg(feature = "slirp")]
570 if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
571 let connection = Connection::<FrontendReq>::from(net_vhost_user_tube);
572 devs.push(create_vhost_user_net_device(cfg, connection)?);
573 }
574
575 #[cfg(feature = "balloon")]
576 if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
577 (balloon_device_tube, dynamic_mapping_device_tube)
578 {
579 devs.push(create_balloon_device(
580 cfg,
581 balloon_device_tube,
582 dynamic_mapping_device_tube,
583 inflate_tube,
584 init_balloon_size,
585 )?);
586 }
587
588 devs.push(create_vsock_device(cfg)?);
589
590 #[cfg(feature = "gpu")]
591 let event_devices = if let Some(InputEventSplitConfig {
592 backend_config,
593 vmm_config,
594 }) = cfg.input_event_split_config.take()
595 {
596 devs.extend(
597 create_virtio_input_event_devices(cfg, vmm_config)
598 .context("create input event devices")?,
599 );
600 backend_config.map(|cfg| cfg.event_devices)
601 } else {
602 None
603 };
604
605 #[cfg(feature = "gpu")]
606 if let Some(wndproc_thread_vmm_config) = cfg
607 .window_procedure_thread_split_config
608 .as_mut()
609 .map(|split_cfg| &mut split_cfg.vmm_config)
610 {
611 product::push_window_procedure_thread_control_tubes(
612 control_tubes,
613 wndproc_thread_vmm_config,
614 );
615 }
616
617 #[cfg(feature = "gpu")]
618 let mut wndproc_thread = cfg
619 .window_procedure_thread_split_config
620 .as_mut()
621 .and_then(|cfg| cfg.wndproc_thread_builder.take())
622 .map(WindowProcedureThreadBuilder::start_thread)
623 .transpose()
624 .context("Failed to start the window procedure thread.")?;
625
626 #[cfg(feature = "gpu")]
627 if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
628 devs.push(create_virtio_gpu_device(
629 cfg,
630 gpu_vmm_config,
631 event_devices,
632 &mut wndproc_thread,
633 control_tubes,
634 )?);
635 }
636
637 Ok(devs)
638 }
639
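/// Creates the VMM side of the virtio input event devices (multi-touch, mouse and keyboard)
/// from the pipes supplied in the GPU VMM configuration.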
640 #[cfg(feature = "gpu")]
641 fn create_virtio_input_event_devices(
642 cfg: &Config,
643 mut input_event_vmm_config: InputEventVmmConfig,
644 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
645 let mut devs = Vec::new();
646
647 // Iterate event devices, create the VMM end.
648 let mut multi_touch_pipes = input_event_vmm_config
649 .multi_touch_pipes
650 .drain(..)
651 .enumerate();
652 for input in &cfg.virtio_input {
653 match input {
654 InputDeviceOption::SingleTouch { .. } => {
655 unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
656 }
657 InputDeviceOption::MultiTouch {
658 width,
659 height,
660 name,
661 ..
662 } => {
663 let Some((idx, pipe)) = multi_touch_pipes.next() else {
664 break;
665 };
666 let mut width = *width;
667 let mut height = *height;
668 if idx == 0 {
669 if width.is_none() {
670 width = cfg.display_input_width;
671 }
672 if height.is_none() {
673 height = cfg.display_input_height;
674 }
675 }
676 devs.push(create_multi_touch_device(
677 cfg,
678 pipe,
679 width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
680 height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
681 name.as_deref(),
682 idx as u32,
683 )?);
684 }
685 _ => {}
686 }
687 }
688 drop(multi_touch_pipes);
689
690 product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;
691
692 for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
693 devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
694 }
695
696 let keyboard_pipe = input_event_vmm_config
697 .keyboard_pipes
698 .pop()
699 .expect("at least one keyboard should be in GPU VMM config");
700 let dev = virtio::input::new_keyboard(
701 /* idx= */ 0,
702 keyboard_pipe,
703 virtio::base_features(cfg.protection_type),
704 )
705 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
706
707 devs.push(VirtioDeviceStub {
708 dev: Box::new(dev),
709 jail: None,
710 });
711
712 Ok(devs)
713 }
714
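/// Creates the virtio GPU device. If a backend config is present, the vhost-user GPU worker
/// is spawned in this process; the frontend always connects over vhost-user.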
715 #[cfg(feature = "gpu")]
716 fn create_virtio_gpu_device(
717 cfg: &mut Config,
718 mut gpu_vmm_config: GpuVmmConfig,
719 event_devices: Option<Vec<EventDevice>>,
720 wndproc_thread: &mut Option<WindowProcedureThread>,
721 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
722 ) -> DeviceResult<VirtioDeviceStub> {
723 let resource_bridges = Vec::<Tube>::new();
724
725 product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);
726
727 // If the GPU backend is passed, start up the vhost-user worker in the main process.
728 if let Some(backend_config) = cfg.gpu_backend_config.take() {
729 let event_devices = event_devices.ok_or_else(|| {
730 anyhow!("event devices are missing when creating virtio-gpu in the current process.")
731 })?;
732 let wndproc_thread = wndproc_thread
733 .take()
734 .ok_or_else(|| anyhow!("Window procedure thread is missing."))?;
735
736 std::thread::spawn(move || {
737 run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
738 });
739 }
740
741 // The GPU is always vhost-user, even if running in the main process.
742 let gpu_device_tube = gpu_vmm_config
743 .main_vhost_user_tube
744 .take()
745 .expect("GPU VMM vhost-user tube should be set");
746 let connection = Connection::<FrontendReq>::from(gpu_device_tube);
747
748 create_vhost_user_gpu_device(virtio::base_features(cfg.protection_type), connection)
749 .context("create vhost-user GPU device")
750 }
751
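/// Creates a virtio sound device. The backend worker may run in this process, but the
/// frontend always connects over vhost-user.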
752 #[cfg(feature = "audio")]
753 fn create_virtio_snd_device(
754 cfg: &mut Config,
755 snd_split_config: &mut SndSplitConfig,
756 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
757 ) -> DeviceResult<VirtioDeviceStub> {
758 let snd_vmm_config = snd_split_config
759 .vmm_config
760 .as_mut()
761 .expect("snd_vmm_config must exist");
762 product::push_snd_control_tubes(control_tubes, snd_vmm_config);
763
764 // If the SND backend is passed, start up the vhost-user worker in the main process.
765 if let Some(backend_config) = snd_split_config.backend_config.take() {
766 std::thread::spawn(move || run_snd_device_worker(backend_config));
767 }
768
769 // The SND is always vhost-user, even if running in the main process.
770 let snd_device_tube = snd_vmm_config
771 .main_vhost_user_tube
772 .take()
773 .expect("Snd VMM vhost-user tube should be set");
774 let connection = Connection::<FrontendReq>::from(snd_device_tube);
775
776 create_vhost_user_snd_device(virtio::base_features(cfg.protection_type), connection)
777 .context("create vhost-user SND device")
778 }
779
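/// Creates all virtio devices and wraps each in a `VirtioPciDevice`, setting up the MSI,
/// ioevent, shared memory, and VM control tubes that each PCI device requires.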
780 fn create_devices(
781 cfg: &mut Config,
782 mem: &GuestMemory,
783 exit_evt_wrtube: &SendTube,
784 irq_control_tubes: &mut Vec<Tube>,
785 vm_memory_control_tubes: &mut Vec<Tube>,
786 control_tubes: &mut Vec<TaggedControlTube>,
787 disk_device_tubes: &mut Vec<Tube>,
788 initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
789 balloon_device_tube: Option<Tube>,
790 #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
791 dynamic_mapping_device_tube: Option<Tube>,
792 inflate_tube: Option<Tube>,
793 init_balloon_size: u64,
794 tsc_frequency: u64,
795 virtio_snd_state_device_tube: Option<Tube>,
796 virtio_snd_control_device_tube: Option<Tube>,
797 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
798 let stubs = create_virtio_devices(
799 cfg,
800 exit_evt_wrtube,
801 control_tubes,
802 disk_device_tubes,
803 initial_audio_session_states,
804 balloon_device_tube,
805 #[cfg(feature = "pvclock")]
806 pvclock_device_tube,
807 dynamic_mapping_device_tube,
808 inflate_tube,
809 init_balloon_size,
810 tsc_frequency,
811 virtio_snd_state_device_tube,
812 virtio_snd_control_device_tube,
813 )?;
814
815 let mut pci_devices = Vec::new();
816
817 for stub in stubs {
818 let (msi_host_tube, msi_device_tube) =
819 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
820 irq_control_tubes.push(msi_host_tube);
821
822 let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
823 let (host_tube, device_tube) =
824 Tube::pair().context("failed to create VVU proxy tube")?;
825 vm_memory_control_tubes.push(host_tube);
826 Some(device_tube)
827 } else {
828 None
829 };
830
831 let (ioevent_host_tube, ioevent_device_tube) =
832 Tube::pair().context("failed to create ioevent tube")?;
833 vm_memory_control_tubes.push(ioevent_host_tube);
834
835 let (vm_control_host_tube, vm_control_device_tube) =
836 Tube::pair().context("failed to create vm_control tube")?;
837 control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
838 vm_control_host_tube,
839 )));
840
841 let dev = Box::new(
842 VirtioPciDevice::new(
843 mem.clone(),
844 stub.dev,
845 msi_device_tube,
846 cfg.disable_virtio_intx,
847 shared_memory_tube.map(VmMemoryClient::new),
848 VmMemoryClient::new(ioevent_device_tube),
849 vm_control_device_tube,
850 )
851 .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
852 ) as Box<dyn BusDeviceObj>;
853 pci_devices.push((dev, stub.jail));
854 }
855
856 Ok(pci_devices)
857 }
858
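/// Error type used when handling pvclock requests.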
859 #[derive(Debug)]
860 struct PvClockError(String);
861
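/// Handles one readable event from the main loop's wait context: VM events, broker shutdown,
/// control server connections, control tube requests, and balloon responses. Returns
/// `Some(ExitState)` if the event requires the main loop to exit.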
862 fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
863 event: &TriggeredEvent<Token>,
864 vm_control_ids_to_remove: &mut Vec<usize>,
865 next_control_id: &mut usize,
866 service_vm_state: &mut ServiceVmState,
867 disk_host_tubes: &[Tube],
868 ipc_main_loop_tube: Option<&Tube>,
869 #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
870 vm_evt_rdtube: &RecvTube,
871 control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
872 guest_os: &mut RunnableLinuxVm<V, Vcpu>,
873 sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
874 virtio_snd_host_mute_tubes: &mut [Tube],
875 proto_main_loop_tube: Option<&ProtoTube>,
876 anti_tamper_main_thread_tube: &Option<ProtoTube>,
877 #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
878 memory_size_mb: u64,
879 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
880 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
881 run_mode_arc: &VcpuRunMode,
882 region_state: &mut VmMemoryRegionState,
883 vm_control_server: Option<&mut ControlServer>,
884 irq_handler_control: &Tube,
885 device_ctrl_tube: &Tube,
886 wait_ctx: &WaitContext<Token>,
887 force_s2idle: bool,
888 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
889 suspended_pvclock_state: &mut Option<hypervisor::ClockState>,
890 ) -> Result<Option<ExitState>> {
891 let mut execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
892 if let VmRequest::Exit = request {
893 return (VmResponse::Ok, Some(VmRunMode::Exiting));
894 }
895 let vcpu_size = vcpu_boxes.lock().len();
896 let resp = request.execute(
897 &guest_os.vm,
898 disk_host_tubes,
899 &[],
900 &mut guest_os.pm,
901 #[cfg(feature = "gpu")]
902 gpu_control_tube,
903 #[cfg(not(feature = "gpu"))]
904 None,
905 None,
906 &mut None,
907 |msg| {
908 kick_all_vcpus(
909 run_mode_arc,
910 vcpu_control_channels,
911 vcpu_boxes,
912 guest_os.irq_chip.as_ref(),
913 #[cfg(feature = "pvclock")]
914 pvclock_host_tube,
915 &guest_os.resume_notify_devices,
916 msg,
917 );
918 },
919 force_s2idle,
920 #[cfg(feature = "swap")]
921 None,
922 device_ctrl_tube,
923 vcpu_size,
924 irq_handler_control,
925 || guest_os.irq_chip.as_ref().snapshot(vcpu_size),
926 suspended_pvclock_state,
927 );
928 (resp, None)
929 };
930
931 match event.token {
932 Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
933 Ok(vm_event) => {
934 let exit_state = match vm_event {
935 VmEventType::Exit => {
936 info!("vcpu requested shutdown");
937 Some(ExitState::Stop)
938 }
939 VmEventType::Reset => {
940 info!("vcpu requested reset");
941 Some(ExitState::Reset)
942 }
943 VmEventType::Crash => {
944 info!("vcpu crashed");
945 Some(ExitState::Crash)
946 }
947 VmEventType::Panic(_) => {
948 error!("got pvpanic event. this event is not expected on Windows.");
949 None
950 }
951 VmEventType::WatchdogReset => {
952 info!("vcpu stall detected");
953 Some(ExitState::WatchdogReset)
954 }
955 };
956 return Ok(exit_state);
957 }
958 Err(e) => {
959 warn!("failed to recv VmEvent: {}", e);
960 }
961 },
962 Token::BrokerShutdown => {
963 info!("main loop got broker shutdown event");
964 return Ok(Some(ExitState::Stop));
965 }
966 Token::VmControlServer => {
967 let server =
968 vm_control_server.expect("control server must exist if this event triggers");
969 let client = server.accept();
970 let id = *next_control_id;
971 *next_control_id += 1;
972 wait_ctx
973 .add(client.0.get_read_notifier(), Token::VmControl { id })
974 .exit_context(
975 Exit::WaitContextAdd,
976 "failed to add trigger to wait context",
977 )?;
978 wait_ctx
979 .add(client.0.get_close_notifier(), Token::VmControl { id })
980 .exit_context(
981 Exit::WaitContextAdd,
982 "failed to add trigger to wait context",
983 )?;
984 control_tubes.insert(id, TaggedControlTube::Vm(client));
985 }
986 #[allow(clippy::collapsible_match)]
987 Token::VmControl { id } => {
988 if let Some(tube) = control_tubes.get(&id) {
989 #[allow(clippy::single_match)]
990 match tube {
991 TaggedControlTube::Product(product_tube) => {
992 product::handle_tagged_control_tube_event(
993 product_tube,
994 virtio_snd_host_mute_tubes,
995 service_vm_state,
996 ipc_main_loop_tube,
997 )
998 }
999 TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
1000 Ok(request) => {
1001 let mut run_mode_opt = None;
1002 let response = match request {
1003 VmRequest::HotPlugVfioCommand { device, add } => {
1004 // Suppress warnings.
1005 let _ = (device, add);
1006 unimplemented!("not implemented on Windows");
1007 }
1008 #[cfg(feature = "registered_events")]
1009 VmRequest::RegisterListener { socket_addr, event } => {
1010 unimplemented!("not implemented on Windows");
1011 }
1012 #[cfg(feature = "registered_events")]
1013 VmRequest::UnregisterListener { socket_addr, event } => {
1014 unimplemented!("not implemented on Windows");
1015 }
1016 #[cfg(feature = "registered_events")]
1017 VmRequest::Unregister { socket_addr } => {
1018 unimplemented!("not implemented on Windows");
1019 }
1020 #[cfg(feature = "balloon")]
1021 VmRequest::BalloonCommand(cmd) => {
1022 if let Some(balloon_tube) = balloon_tube {
1023 if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
1024 {
1025 if key != id {
1026 unimplemented!("not implemented on Windows");
1027 }
1028 Some(r)
1029 } else {
1030 None
1031 }
1032 } else {
1033 error!("balloon not enabled");
1034 None
1035 }
1036 }
1037 _ => {
1038 let (resp, run_mode_ret) =
1039 execute_vm_request(request, guest_os);
1040 run_mode_opt = run_mode_ret;
1041 Some(resp)
1042 }
1043 };
1044
1045 if let Some(response) = response {
1046 if let Err(e) = tube.0.send(&response) {
1047 error!("failed to send VmResponse: {}", e);
1048 }
1049 }
1050 if let Some(exit_state) =
1051 handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1052 {
1053 return Ok(Some(exit_state));
1054 }
1055 }
1056 Err(e) => {
1057 if let TubeError::Disconnected = e {
1058 vm_control_ids_to_remove.push(id);
1059 } else {
1060 error!("failed to recv VmRequest: {}", e);
1061 }
1062 }
1063 },
1064 }
1065 }
1066 }
1067 #[cfg(feature = "balloon")]
1068 Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
1069 Ok(resp) => {
1070 for (resp, idx) in resp {
1071 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
1072 if let Err(e) = tube.0.send(&resp) {
1073 error!("failed to send VmResponse: {}", e);
1074 }
1075 } else {
1076 error!("Bad tube index {}", idx);
1077 }
1078 }
1079 }
1080 Err(err) => {
1081 error!("Error processing balloon tube {:?}", err)
1082 }
1083 },
1084 #[cfg(not(feature = "balloon"))]
1085 Token::BalloonTube => unreachable!("balloon tube not registered"),
1086 #[allow(unreachable_patterns)]
1087 _ => {
1088 let run_mode_opt = product::handle_received_token(
1089 &event.token,
1090 anti_tamper_main_thread_tube,
1091 #[cfg(feature = "balloon")]
1092 balloon_tube,
1093 control_tubes,
1094 guest_os,
1095 ipc_main_loop_tube,
1096 memory_size_mb,
1097 proto_main_loop_tube,
1098 #[cfg(feature = "pvclock")]
1099 pvclock_host_tube,
1100 run_mode_arc,
1101 service_vm_state,
1102 vcpu_boxes,
1103 virtio_snd_host_mute_tubes,
1104 execute_vm_request,
1105 );
1106 if let Some(exit_state) = handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1107 {
1108 return Ok(Some(exit_state));
1109 }
1110 }
1111 };
1112 Ok(None)
1113 }
1114
1115 /// Handles a run mode change, if one is pending as a result of a VmRequest. The parameter,
1116 /// run_mode_opt, is the run mode change proposed by the VmRequest's execution.
1118 ///
1119 /// Returns the exit state, if it changed due to a run mode change.
1120 /// None otherwise.
1121 fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1122 run_mode_opt: &Option<VmRunMode>,
1123 guest_os: &mut RunnableLinuxVm<V, Vcpu>,
1124 ) -> Option<ExitState> {
1125 if let Some(run_mode) = run_mode_opt {
1126 info!("control socket changed run mode to {}", run_mode);
1127 match run_mode {
1128 VmRunMode::Exiting => return Some(ExitState::Stop),
1129 _ => unreachable!(),
1130 }
1131 }
1132 // No exit state change.
1133 None
1134 }
1135
1136 /// Commands to control the VM Memory handler thread.
1137 #[derive(serde::Serialize, serde::Deserialize)]
1138 pub enum VmMemoryHandlerRequest {
1139 /// No response is sent for this command.
1140 Exit,
1141 }
1142
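/// Worker thread that services VmMemoryRequests arriving on the given control tubes until
/// asked to exit via `handler_control`.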
1143 fn vm_memory_handler_thread(
1144 control_tubes: Vec<Tube>,
1145 mut vm: impl Vm,
1146 sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
1147 mut gralloc: RutabagaGralloc,
1148 handler_control: Tube,
1149 ) -> anyhow::Result<()> {
1150 #[derive(EventToken)]
1151 enum Token {
1152 VmControl { id: usize },
1153 HandlerControl,
1154 }
1155
1156 let wait_ctx =
1157 WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
1158 .context("failed to build wait context")?;
1159 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1160 for (id, socket) in control_tubes.iter() {
1161 wait_ctx
1162 .add(socket.get_read_notifier(), Token::VmControl { id: *id })
1163 .context("failed to add descriptor to wait context")?;
1164 }
1165
1166 let mut region_state: VmMemoryRegionState = Default::default();
1167
1168 'wait: loop {
1169 let events = {
1170 match wait_ctx.wait() {
1171 Ok(v) => v,
1172 Err(e) => {
1173 error!("failed to poll: {}", e);
1174 break;
1175 }
1176 }
1177 };
1178
1179 let mut vm_control_ids_to_remove = Vec::new();
1180 for event in events.iter().filter(|e| e.is_readable) {
1181 match event.token {
1182 Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
1183 Ok(request) => match request {
1184 VmMemoryHandlerRequest::Exit => break 'wait,
1185 },
1186 Err(e) => {
1187 if let TubeError::Disconnected = e {
1188 panic!("vm memory control tube disconnected.");
1189 } else {
1190 error!("failed to recv VmMemoryHandlerRequest: {}", e);
1191 }
1192 }
1193 },
1194
1195 Token::VmControl { id } => {
1196 if let Some(tube) = control_tubes.get(&id) {
1197 match tube.recv::<VmMemoryRequest>() {
1198 Ok(request) => {
1199 let response = request.execute(
1200 &mut vm,
1201 &mut sys_allocator_mutex.lock(),
1202 &mut gralloc,
1203 None,
1204 &mut region_state,
1205 );
1206 if let Err(e) = tube.send(&response) {
1207 error!("failed to send VmMemoryControlResponse: {}", e);
1208 }
1209 }
1210 Err(e) => {
1211 if let TubeError::Disconnected = e {
1212 vm_control_ids_to_remove.push(id);
1213 } else {
1214 error!("failed to recv VmMemoryControlRequest: {}", e);
1215 }
1216 }
1217 }
1218 }
1219 }
1220 }
1221 }
1222
1223 remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1224 if events
1225 .iter()
1226 .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
1227 {
1228 error!("vm memory handler control hung up but did not request an exit.");
1229 break 'wait;
1230 }
1231 }
1232 Ok(())
1233 }
1234
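/// Creates the external control server (non-prod builds only) and registers it with the wait
/// context so that incoming client connections wake the main loop.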
1235 fn create_control_server(
1236 control_server_path: Option<PathBuf>,
1237 wait_ctx: &WaitContext<Token>,
1238 ) -> Result<Option<ControlServer>> {
1239 #[cfg(not(feature = "prod-build"))]
1240 {
1241 if let Some(path) = control_server_path {
1242 let server =
1243 ControlServer::new(path.to_str().expect("control socket path must be a string"))
1244 .exit_context(
1245 Exit::FailedToCreateControlServer,
1246 "failed to create control server",
1247 )?;
1248 wait_ctx
1249 .add(server.client_waiting(), Token::VmControlServer)
1250 .exit_context(
1251 Exit::WaitContextAdd,
1252 "failed to add control server to wait context",
1253 )?;
1254 return Ok(Some(server));
1255 }
1256 }
1257 Ok::<Option<ControlServer>, anyhow::Error>(None)
1258 }
1259
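/// The main VMM control loop on Windows: starts the IRQ, VM memory, device, and vCPU worker
/// threads, optionally restores from a snapshot, services control and VM events until
/// shutdown, then tears the workers down and returns the final `ExitState`.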
1260 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1261 mut guest_os: RunnableLinuxVm<V, Vcpu>,
1262 sys_allocator: SystemAllocator,
1263 control_tubes: Vec<TaggedControlTube>,
1264 irq_control_tubes: Vec<Tube>,
1265 vm_memory_control_tubes: Vec<Tube>,
1266 vm_evt_rdtube: RecvTube,
1267 vm_evt_wrtube: SendTube,
1268 #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
1269 broker_shutdown_evt: Option<Event>,
1270 balloon_host_tube: Option<Tube>,
1271 #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
1272 disk_host_tubes: Vec<Tube>,
1273 initial_audio_session_states: Vec<InitialAudioSessionState>,
1274 gralloc: RutabagaGralloc,
1275 #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
1276 service_pipe_name: Option<String>,
1277 memory_size_mb: u64,
1278 host_cpu_topology: bool,
1279 tsc_sync_mitigations: TscSyncMitigations,
1280 force_calibrated_tsc_leaf: bool,
1281 mut product_args: RunControlArgs,
1282 mut virtio_snd_host_mute_tubes: Vec<Tube>,
1283 restore_path: Option<PathBuf>,
1284 control_server_path: Option<PathBuf>,
1285 force_s2idle: bool,
1286 suspended: bool,
1287 ) -> Result<ExitState> {
1288 let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
1289 start_service_ipc_listener(service_pipe_name)?;
1290
1291 let mut service_vm_state = product::create_service_vm_state(memory_size_mb);
1292
1293 let service_audio_states = product::create_service_audio_states_and_send_to_service(
1294 initial_audio_session_states,
1295 &ipc_main_loop_tube,
1296 )?;
1297
1298 let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
1299
1300 let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
1301 let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
1302 Exit::CreateTube,
1303 "failed to create IRQ handler control Tube",
1304 )?;
1305
1306 // Create a separate thread to wait on IRQ events. This is a natural division
1307 // because IRQ interrupts have no dependencies on other events, and this lets
1308 // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
1309 let irq_join_handle = IrqWaitWorker::start(
1310 irq_handler_control_for_worker,
1311 guest_os
1312 .irq_chip
1313 .try_box_clone()
1314 .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
1315 irq_control_tubes,
1316 sys_allocator_mutex.clone(),
1317 );
1318
1319 let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
1320 product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
1321 let wait_ctx = WaitContext::build_with(&triggers).exit_context(
1322 Exit::WaitContextAdd,
1323 "failed to add trigger to wait context",
1324 )?;
1325
1326 #[cfg(feature = "balloon")]
1327 let mut balloon_tube = balloon_host_tube
1328 .map(|tube| -> Result<BalloonTube> {
1329 wait_ctx
1330 .add(tube.get_read_notifier(), Token::BalloonTube)
1331 .context("failed to add trigger to wait context")?;
1332 Ok(BalloonTube::new(tube))
1333 })
1334 .transpose()
1335 .context("failed to create balloon tube")?;
1336
1337 let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
1338 let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
1339 .name("vm_memory_handler_thread".into())
1340 .spawn({
1341 let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
1342 let sys_allocator_mutex = sys_allocator_mutex.clone();
1343 move || {
1344 vm_memory_handler_thread(
1345 vm_memory_control_tubes,
1346 vm,
1347 sys_allocator_mutex,
1348 gralloc,
1349 vm_memory_handler_control_for_thread,
1350 )
1351 }
1352 })
1353 .unwrap();
1354
1355 if let Some(evt) = broker_shutdown_evt.as_ref() {
1356 wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
1357 Exit::WaitContextAdd,
1358 "failed to add trigger to wait context",
1359 )?;
1360 }
1361
1362 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1363 let mut next_control_id = control_tubes.len();
1364 for (id, control_tube) in control_tubes.iter() {
1365 #[allow(clippy::single_match)]
1366 match control_tube {
1367 TaggedControlTube::Product(product_tube) => wait_ctx
1368 .add(
1369 product_tube.get_read_notifier(),
1370 Token::VmControl { id: *id },
1371 )
1372 .exit_context(
1373 Exit::WaitContextAdd,
1374 "failed to add trigger to wait context",
1375 )?,
1376 _ => (),
1377 }
1378 }
1379
1380 let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
1381 guest_os.devices_thread = match create_devices_worker_thread(
1382 guest_os.vm.get_memory().clone(),
1383 guest_os.io_bus.clone(),
1384 guest_os.mmio_bus.clone(),
1385 device_ctrl_resp,
1386 ) {
1387 Ok(join_handle) => Some(join_handle),
1388 Err(e) => {
1389 return Err(anyhow!("Failed to start devices thread: {}", e));
1390 }
1391 };
1392
1393 let vcpus: Vec<Option<_>> = match guest_os.vcpus.take() {
1394 Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
1395 None => iter::repeat_with(|| None)
1396 .take(guest_os.vcpu_count)
1397 .collect(),
1398 };
1399
1400 let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);
1401
1402 let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;
1403
1404 let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;
1405
1406 let original_terminal_mode = stdin().set_raw_mode().ok();
1407
1408 let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
1409 let run_mode_arc = Arc::new(VcpuRunMode::default());
1410
1411 let run_mode_state = if suspended {
1412 // Sleep devices before creating vcpus.
1413 device_ctrl_tube
1414 .send(&DeviceControlCommand::SleepDevices)
1415 .context("send command to devices control socket")?;
1416 match device_ctrl_tube
1417 .recv()
1418 .context("receive from devices control socket")?
1419 {
1420 VmResponse::Ok => (),
1421 resp => bail!("device sleep failed: {}", resp),
1422 }
1423 run_mode_arc.set_and_notify(VmRunMode::Suspending);
1424 VmRunMode::Suspending
1425 } else {
1426 VmRunMode::Running
1427 };
1428
1429 // If we are restoring from a snapshot, then start suspended.
1430 if restore_path.is_some() {
1431 run_mode_arc.set_and_notify(VmRunMode::Suspending);
1432 }
1433
1434 let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
1435 vcpus,
1436 vcpu_boxes.clone(),
1437 &guest_os,
1438 &exit_evt,
1439 &vm_evt_wrtube,
1440 #[cfg(feature = "stats")]
1441 &stats,
1442 host_cpu_topology,
1443 run_mode_arc.clone(),
1444 tsc_sync_mitigations,
1445 force_calibrated_tsc_leaf,
1446 )?;
1447
1448 // See comment on `VmRequest::execute`.
1449 let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
1450
1451 // Restore VM (if applicable).
1452 if let Some(path) = restore_path {
1453 vm_control::do_restore(
1454 &path,
1455 |msg| {
1456 kick_all_vcpus(
1457 run_mode_arc.as_ref(),
1458 &vcpu_control_channels,
1459 vcpu_boxes.as_ref(),
1460 guest_os.irq_chip.as_ref(),
1461 #[cfg(feature = "pvclock")]
1462 &pvclock_host_tube,
1463 &guest_os.resume_notify_devices,
1464 msg,
1465 )
1466 },
1467 |msg, index| {
1468 kick_vcpu(
1469 run_mode_arc.as_ref(),
1470 &vcpu_control_channels,
1471 vcpu_boxes.as_ref(),
1472 guest_os.irq_chip.as_ref(),
1473 index,
1474 msg,
1475 )
1476 },
1477 &irq_handler_control,
1478 &device_ctrl_tube,
1479 guest_os.vcpu_count,
1480 |image| {
1481 guest_os
1482 .irq_chip
1483 .try_box_clone()?
1484 .restore(image, guest_os.vcpu_count)
1485 },
1486 /* require_encrypted= */ false,
1487 &mut suspended_pvclock_state,
1488 &guest_os.vm,
1489 )?;
1490 // Allow the vCPUs to start for real.
1491 kick_all_vcpus(
1492 run_mode_arc.as_ref(),
1493 &vcpu_control_channels,
1494 vcpu_boxes.as_ref(),
1495 guest_os.irq_chip.as_ref(),
1496 #[cfg(feature = "pvclock")]
1497 &pvclock_host_tube,
1498 &guest_os.resume_notify_devices,
1499 // Other platforms (unix) have multiple modes they could start in (e.g. starting for
1500 // guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
1501 // to enter that mode here rather than VmRunMode::Running.
1502 VcpuControl::RunState(run_mode_state),
1503 );
1504 }
1505
1506 let mut exit_state = ExitState::Stop;
1507 let mut region_state: VmMemoryRegionState = Default::default();
1508
1509 'poll: loop {
1510 let events = {
1511 match wait_ctx.wait() {
1512 Ok(v) => v,
1513 Err(e) => {
1514 error!("failed to wait: {}", e);
1515 break;
1516 }
1517 }
1518 };
1519
1520 let mut vm_control_ids_to_remove = Vec::new();
1521 for event in events.iter().filter(|e| e.is_readable) {
1522 let state = handle_readable_event(
1523 event,
1524 &mut vm_control_ids_to_remove,
1525 &mut next_control_id,
1526 &mut service_vm_state,
1527 disk_host_tubes.as_slice(),
1528 ipc_main_loop_tube.as_ref(),
1529 #[cfg(feature = "gpu")]
1530 gpu_control_tube.as_ref(),
1531 &vm_evt_rdtube,
1532 &mut control_tubes,
1533 &mut guest_os,
1534 &sys_allocator_mutex,
1535 &mut virtio_snd_host_mute_tubes,
1536 proto_main_loop_tube.as_ref(),
1537 &anti_tamper_main_thread_tube,
1538 #[cfg(feature = "balloon")]
1539 balloon_tube.as_mut(),
1540 memory_size_mb,
1541 vcpu_boxes.as_ref(),
1542 #[cfg(feature = "pvclock")]
1543 &pvclock_host_tube,
1544 run_mode_arc.as_ref(),
1545 &mut region_state,
1546 vm_control_server.as_mut(),
1547 &irq_handler_control,
1548 &device_ctrl_tube,
1549 &wait_ctx,
1550 force_s2idle,
1551 &vcpu_control_channels,
1552 &mut suspended_pvclock_state,
1553 )?;
1554 if let Some(state) = state {
1555 exit_state = state;
1556 break 'poll;
1557 }
1558 }
1559
1560 remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1561 }
1562
1563 info!("run_control poll loop completed, forcing vCPUs to exit...");
1564
1565 // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
1566 run_mode_arc.set_and_notify(VmRunMode::Exiting);
1567
1568 // Force all vcpus to exit from the hypervisor
1569 for vcpu in vcpu_boxes.lock().iter() {
1570 vcpu.set_immediate_exit(true);
1571 }
1572
1573 let mut res = Ok(exit_state);
1574 guest_os.irq_chip.kick_halted_vcpus();
1575 let _ = exit_evt.signal();
1576
1577 if guest_os.devices_thread.is_some() {
1578 if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
1579 error!("failed to stop device control loop: {}", e);
1580 };
1581 if let Some(thread) = guest_os.devices_thread.take() {
1582 if let Err(e) = thread.join() {
1583 error!("failed to exit devices thread: {:?}", e);
1584 }
1585 }
1586 }
1587
1588 // Shut down the VM memory handler thread.
1589 if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
1590 error!(
1591 "failed to request exit from VM memory handler thread: {}",
1592 e
1593 );
1594 }
1595 if let Err(e) = vm_memory_handler_thread_join_handle.join() {
1596 error!("failed to exit VM Memory handler thread: {:?}", e);
1597 }
1598
1599 // Shut down the IRQ handler thread.
1600 if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
1601 error!("failed to request exit from IRQ handler thread: {}", e);
1602 }
1603
1604     // Send the Exit vm event (possibly again) to ensure any remaining child threads abort
1605     // their run loops.
1606 let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
1607 for (i, thread) in vcpu_threads.into_iter().enumerate() {
1608         // Wait until all the threads exit so that the guest_os.vm Arc reference count drops
1609         // to 1; otherwise, force-killing the thread with terminate would leak memory.
1610 match thread.join() {
1611 Ok(Err(e)) => {
1612 error!("vcpu thread {} exited with an error: {}", i, e);
1613 res = Err(e);
1614 }
1615 Ok(_) => {}
1616 Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
1617 }
1618 }
1619
1620 info!("vCPU threads have exited.");
1621
1622 if let Some(ime) = ime_thread {
1623 match ime.join() {
1624 Ok(Err(e)) => {
1625 error!("ime thread exited with an error: {}", e);
1626 if res.is_ok() {
1627 // Prioritize past errors, but return this error if it is unique, otherwise just
1628 // log it.
1629 res = Err(e)
1630 }
1631 }
1632 Ok(_) => {}
1633 Err(e) => error!("ime thread panicked: {:?}", e),
1634 }
1635 }
1636 info!("IME thread has exited.");
1637
1638 // This cancels all the outstanding and any future blocking operations.
1639     // TODO(b/196911556): Shut down the executor for a cleaner shutdown. Since we are using the
1640     // global executor, for a cleaner shutdown we have to call disarm so that all incoming
1641     // requests are run and then cancelled. If we call shutdown, all blocking threads will go
1642     // away, and incoming operations won't be scheduled to run and will be dropped, leading to a
1643     // panic. The ideal place to call shutdown is when we drop a non-global executor.
1644 cros_async::unblock_disarm();
1645 info!("blocking async pool has shut down.");
1646
1647 let _ = irq_join_handle.join();
1648 info!("IrqWaitWorker has shut down.");
1649
1650 #[cfg(feature = "stats")]
1651 if let Some(stats) = stats {
1652 println!("Statistics Collected:\n{}", stats.lock());
1653 println!("Statistics JSON:\n{}", stats.lock().json());
1654 }
1655
1656 if let Some(mode) = original_terminal_mode {
1657 if let Err(e) = stdin().restore_mode(mode) {
1658 warn!("failed to restore terminal mode: {}", e);
1659 }
1660 }
1661
1662 // Explicitly drop the VM structure here to allow the devices to clean up before the
1663 // control tubes are closed when this function exits.
1664 mem::drop(guest_os);
1665
1666 info!("guest_os dropped, run_control is done.");
1667
1668 res
1669 }
1670
1671 /// Remove Tubes that have been closed from the WaitContext.
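///
/// Duplicate ids are tolerated: a Tube is removed from the map and the wait context at most once.
/// A minimal usage sketch (illustrative only; `wait_ctx` and `control_tubes` come from the
/// surrounding run_control state):
///
/// ```ignore
/// remove_closed_tubes(&wait_ctx, &mut control_tubes, vec![3, 3, 7])?;
/// ```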
1672 fn remove_closed_tubes<T, U>(
1673 wait_ctx: &WaitContext<T>,
1674 tubes: &mut BTreeMap<usize, U>,
1675 mut tube_ids_to_remove: Vec<usize>,
1676 ) -> anyhow::Result<()>
1677 where
1678 T: EventToken,
1679 U: ReadNotifier + CloseNotifier,
1680 {
1681 tube_ids_to_remove.dedup();
1682 for id in tube_ids_to_remove {
1683 if let Some(socket) = tubes.remove(&id) {
1684 wait_ctx
1685 .delete(socket.get_read_notifier())
1686 .context("failed to remove descriptor from wait context")?;
1687
1688 // There may be a close notifier registered for this Tube. If there isn't one
1689 // registered, we just ignore the error.
1690 let _ = wait_ctx.delete(socket.get_close_notifier());
1691 }
1692 }
1693 Ok(())
1694 }
1695
1696 /// Sends a message to all VCPUs.
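///
/// A minimal call sketch (illustrative only; the names below come from the surrounding
/// run_control state and are not defined here):
///
/// ```ignore
/// kick_all_vcpus(
///     run_mode_arc.as_ref(),
///     &vcpu_control_channels,
///     vcpu_boxes.as_ref(),
///     guest_os.irq_chip.as_ref(),
///     #[cfg(feature = "pvclock")]
///     &pvclock_host_tube,
///     /* resume_notify_devices= */ &[],
///     VcpuControl::RunState(VmRunMode::Suspending),
/// );
/// ```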
1697 fn kick_all_vcpus(
1698 run_mode: &VcpuRunMode,
1699 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1700 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1701 irq_chip: &dyn IrqChipArch,
1702 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1703 resume_notify_devices: &[Arc<Mutex<dyn BusResumeDevice>>],
1704 msg: VcpuControl,
1705 ) {
1706 // On Windows, we handle run mode switching directly rather than delegating to the VCPU thread
1707 // like unix does.
1708 match &msg {
1709 VcpuControl::RunState(VmRunMode::Suspending) => {
1710 suspend_all_vcpus(
1711 run_mode,
1712 vcpu_boxes,
1713 irq_chip,
1714 #[cfg(feature = "pvclock")]
1715 pvclock_host_tube,
1716 );
1717 return;
1718 }
1719 VcpuControl::RunState(VmRunMode::Running) => {
1720 for device in resume_notify_devices {
1721 device.lock().resume_imminent();
1722 }
1723 resume_all_vcpus(
1724 run_mode,
1725 vcpu_boxes,
1726 irq_chip,
1727 #[cfg(feature = "pvclock")]
1728 pvclock_host_tube,
1729 );
1730 return;
1731 }
1732 _ => (),
1733 }
1734
1735     // For non-RunState commands, we dispatch just like unix does.
1736 for vcpu in vcpu_control_channels {
1737 if let Err(e) = vcpu.send(msg.clone()) {
1738 error!("failed to send VcpuControl message: {}", e);
1739 }
1740 }
1741
1742 // Now that we've sent a message, we need VCPUs to exit so they can process it.
1743 for vcpu in vcpu_boxes.lock().iter() {
1744 vcpu.set_immediate_exit(true);
1745 }
1746 irq_chip.kick_halted_vcpus();
1747
1748 // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1749 // the control message.
1750 let current_run_mode = run_mode.get_mode();
1751 if current_run_mode != VmRunMode::Running {
1752 run_mode.set_and_notify(current_run_mode);
1753 }
1754 }
1755
1756 /// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a single
1757 /// VCPU.
1758 fn kick_vcpu(
1759 run_mode: &VcpuRunMode,
1760 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1761 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1762 irq_chip: &dyn IrqChipArch,
1763 index: usize,
1764 msg: VcpuControl,
1765 ) {
1766 assert!(
1767 !matches!(msg, VcpuControl::RunState(_)),
1768 "Windows does not support RunState changes on a per VCPU basis"
1769 );
1770
1771 let vcpu = vcpu_control_channels
1772 .get(index)
1773 .expect("invalid vcpu index specified");
1774 if let Err(e) = vcpu.send(msg) {
1775 error!("failed to send VcpuControl message: {}", e);
1776 }
1777
1778 // Now that we've sent a message, we need the VCPU to exit so it can
1779 // process the message.
1780 vcpu_boxes
1781 .lock()
1782 .get(index)
1783 .expect("invalid vcpu index specified")
1784 .set_immediate_exit(true);
1785 irq_chip.kick_halted_vcpus();
1786
1787 // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1788 // the control message. (Technically this wakes all VCPUs, but those without messages will go
1789 // back to sleep.)
1790 let current_run_mode = run_mode.get_mode();
1791 if current_run_mode != VmRunMode::Running {
1792 run_mode.set_and_notify(current_run_mode);
1793 }
1794 }
1795
1796 /// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
1797 /// though devices on the host will continue to run.
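///
/// Typically paired with [`resume_all_vcpus`]; a sketch (illustrative only):
///
/// ```ignore
/// suspend_all_vcpus(&run_mode, &vcpu_boxes, irq_chip, #[cfg(feature = "pvclock")] &pvclock_host_tube);
/// // ... inspect or snapshot guest state while it is frozen ...
/// resume_all_vcpus(&run_mode, &vcpu_boxes, irq_chip, #[cfg(feature = "pvclock")] &pvclock_host_tube);
/// ```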
1798 pub(crate) fn suspend_all_vcpus(
1799 run_mode: &VcpuRunMode,
1800 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1801 irq_chip: &dyn IrqChipArch,
1802 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1803 ) {
1804 // VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
1805 // they may re-enter the VM.
1806 run_mode.set_and_notify(VmRunMode::Suspending);
1807
1808 // Force all vcpus to exit from the hypervisor
1809 for vcpu in vcpu_boxes.lock().iter() {
1810 vcpu.set_immediate_exit(true);
1811 }
1812 irq_chip.kick_halted_vcpus();
1813
1814 #[cfg(feature = "pvclock")]
1815 handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
1816 .unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
1817 }
1818
1819 /// Resumes all VCPUs.
1820 pub(crate) fn resume_all_vcpus(
1821 run_mode: &VcpuRunMode,
1822 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1823 irq_chip: &dyn IrqChipArch,
1824 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1825 ) {
1826 #[cfg(feature = "pvclock")]
1827 handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
1828 .unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));
1829
1830 // Make sure any immediate exit bits are disabled
1831 for vcpu in vcpu_boxes.lock().iter() {
1832 vcpu.set_immediate_exit(false);
1833 }
1834
1835 run_mode.set_and_notify(VmRunMode::Running);
1836 }
1837
1838 #[cfg(feature = "gvm")]
1839 const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
1840 major: 1,
1841 minor: 4,
1842 patch: 1,
1843 };
1844
1845 #[cfg(feature = "gvm")]
1846 fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
1847 match gvm.get_full_version() {
1848 Ok(version) => {
1849 if version < GVM_MINIMUM_VERSION {
1850 error!(
1851 "GVM version {} is below minimum version {}",
1852 version, GVM_MINIMUM_VERSION
1853 );
1854 return Err(base::Error::new(libc::ENXIO).into());
1855 } else {
1856 info!("Using GVM version {}.", version)
1857 }
1858 }
1859 Err(e) => {
1860 error!("unable to determine gvm version: {}", e);
1861 return Err(base::Error::new(libc::ENXIO).into());
1862 }
1863 }
1864 let vm = GvmVm::new(&gvm, mem)?;
1865 Ok(vm)
1866 }
1867
1868 #[cfg(feature = "haxm")]
1869 fn create_haxm_vm(
1870 haxm: Haxm,
1871 mem: GuestMemory,
1872 kernel_log_file: &Option<String>,
1873 ) -> Result<HaxmVm> {
1874 let vm = HaxmVm::new(&haxm, mem)?;
1875 if let Some(path) = kernel_log_file {
1876 use hypervisor::haxm::HAX_CAP_VM_LOG;
1877 if vm.check_raw_capability(HAX_CAP_VM_LOG) {
1878 match vm.register_log_file(path) {
1879 Ok(_) => {}
1880 Err(e) => match e.errno() {
1881 libc::E2BIG => {
1882 error!(
1883 "kernel_log_file path is too long, kernel log file will not be written"
1884 );
1885 }
1886 _ => return Err(e.into()),
1887 },
1888 }
1889 } else {
1890 warn!(
1891 "kernel_log_file specified but this version of HAXM does not support kernel log \
1892 files"
1893 );
1894 }
1895 }
1896 Ok(vm)
1897 }
1898
1899 #[cfg(feature = "whpx")]
1900 #[cfg(target_arch = "x86_64")]
1901 fn create_whpx_vm(
1902 whpx: Whpx,
1903 mem: GuestMemory,
1904 cpu_count: usize,
1905 no_smt: bool,
1906 apic_emulation: bool,
1907 force_calibrated_tsc_leaf: bool,
1908 vm_evt_wrtube: SendTube,
1909 ) -> Result<WhpxVm> {
1910 let cpu_config = hypervisor::CpuConfigX86_64::new(
1911 force_calibrated_tsc_leaf,
1912 false, /* host_cpu_topology */
1913 false, /* enable_hwp */
1914 no_smt,
1915 false, /* itmt */
1916 None, /* hybrid_type */
1917 );
1918
1919 // context for non-cpu-specific cpuid results
1920 let ctx = CpuIdContext::new(
1921 0,
1922 cpu_count,
1923 None,
1924 cpu_config,
1925 whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
1926 __cpuid_count,
1927 __cpuid,
1928 );
1929
1930 // Get all cpuid entries that we should pre-set
1931 let mut cpuid = whpx.get_supported_cpuid()?;
1932
1933 // Adjust them for crosvm
1934 for entry in cpuid.cpu_id_entries.iter_mut() {
1935 adjust_cpuid(entry, &ctx);
1936 }
1937
1938 let vm = WhpxVm::new(
1939 &whpx,
1940 cpu_count,
1941 mem,
1942 cpuid,
1943 apic_emulation,
1944 Some(vm_evt_wrtube),
1945 )
1946 .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
1947
1948 Ok(vm)
1949 }
1950
1951 #[cfg(feature = "gvm")]
1952 fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
1953 info!("Creating GVM irqchip");
1954 let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
1955 Ok(irq_chip)
1956 }
1957
1958 #[cfg(feature = "whpx")]
1959 #[cfg(target_arch = "x86_64")]
1960 fn create_whpx_split_irq_chip(
1961 vm: &WhpxVm,
1962 ioapic_device_tube: Tube,
1963 ) -> base::Result<WhpxSplitIrqChip> {
1964 info!("Creating WHPX split irqchip");
1965 WhpxSplitIrqChip::new(
1966 vm.try_clone()?,
1967 ioapic_device_tube,
1968 None, // ioapic_pins
1969 )
1970 }
1971
1972 fn create_userspace_irq_chip<Vcpu>(
1973 vcpu_count: usize,
1974 ioapic_device_tube: Tube,
1975 ) -> base::Result<UserspaceIrqChip<Vcpu>>
1976 where
1977 Vcpu: VcpuArch + 'static,
1978 {
1979 info!("Creating userspace irqchip");
1980 let irq_chip =
1981 UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
1982 Ok(irq_chip)
1983 }
1984
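/// Picks the hypervisor to use when the `Config` does not specify one: the first compiled-in
/// hypervisor that can actually be initialized on this host, or `None` if none are usable.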
1985 pub fn get_default_hypervisor() -> Option<HypervisorKind> {
1986     // The ordering here matters: hypervisors are listed from most preferred to least preferred.
1987 #[cfg(feature = "whpx")]
1988 match hypervisor::whpx::Whpx::is_enabled() {
1989 true => return Some(HypervisorKind::Whpx),
1990 false => warn!("Whpx not enabled."),
1991 };
1992
1993 #[cfg(feature = "haxm")]
1994 match Haxm::new() {
1995 Ok(_) => return Some(HypervisorKind::Ghaxm),
1996 Err(e) => warn!("Cannot initialize HAXM: {}", e),
1997 };
1998
1999 #[cfg(feature = "gvm")]
2000 // Make sure Gvm device can be opened before selecting it.
2001 match Gvm::new() {
2002 Ok(_) => return Some(HypervisorKind::Gvm),
2003 Err(e) => warn!("Cannot initialize GVM: {}", e),
2004 };
2005
2006 None
2007 }
2008
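/// Builds the `VmComponents` for the VM from the parsed `Config`, opening the files the
/// architecture code needs (kernel or BIOS, optional initrd, pflash, Android fstab, and ACPI
/// tables) and converting memory sizes from MiB to bytes.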
2009 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
2010 let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
2011 Some(
2012 File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
2013 format!("failed to open initrd {}", initrd_path.display())
2014 })?,
2015 )
2016 } else {
2017 None
2018 };
2019
2020 let vm_image = match cfg.executable_path {
2021 Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
2022 File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
2023 format!("failed to open kernel image {}", kernel_path.display(),)
2024 })?,
2025 ),
2026 Some(Executable::Bios(ref bios_path)) => {
2027 VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
2028 format!("failed to open bios {}", bios_path.display())
2029 })?)
2030 }
2031 _ => panic!("Did not receive a bios or kernel, should be impossible."),
2032 };
2033
2034 let swiotlb = if let Some(size) = cfg.swiotlb {
2035 Some(
2036 size.checked_mul(1024 * 1024)
2037 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
2038 )
2039 } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
2040 None
2041 } else {
2042 Some(64 * 1024 * 1024)
2043 };
2044
2045 let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
2046 {
2047 (
2048 Some(
2049 open_file_or_duplicate(
2050 &pflash_parameters.path,
2051 OpenOptions::new().read(true).write(true),
2052 )
2053 .with_context(|| {
2054 format!("failed to open pflash {}", pflash_parameters.path.display())
2055 })?,
2056 ),
2057 pflash_parameters.block_size,
2058 )
2059 } else {
2060 (None, 0)
2061 };
2062
2063 Ok(VmComponents {
2064 memory_size: cfg
2065 .memory
2066 .unwrap_or(256)
2067 .checked_mul(1024 * 1024)
2068 .ok_or_else(|| anyhow!("requested memory size too large"))?,
2069 swiotlb,
2070 vcpu_count: cfg.vcpu_count.unwrap_or(1),
2071 fw_cfg_enable: false,
2072 bootorder_fw_cfg_blob: Vec::new(),
2073 vcpu_affinity: cfg.vcpu_affinity.clone(),
2074 cpu_clusters: cfg.cpu_clusters.clone(),
2075 cpu_capacity: cfg.cpu_capacity.clone(),
2076 no_smt: cfg.no_smt,
2077 hugepages: cfg.hugepages,
2078 hv_cfg: hypervisor::Config {
2079 protection_type: cfg.protection_type,
2080 },
2081 vm_image,
2082 android_fstab: cfg
2083 .android_fstab
2084 .as_ref()
2085 .map(|x| {
2086 File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
2087 format!("failed to open android fstab file {}", x.display())
2088 })
2089 })
2090 .map_or(Ok(None), |v| v.map(Some))?,
2091 pstore: cfg.pstore.clone(),
2092 pflash_block_size,
2093 pflash_image,
2094 initrd_image,
2095 extra_kernel_params: cfg.params.clone(),
2096 acpi_sdts: cfg
2097 .acpi_tables
2098 .iter()
2099 .map(|path| {
2100 SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
2101 format!("failed to open ACPI file {}", path.display())
2102 })
2103 })
2104 .collect::<Result<Vec<SDT>>>()?,
2105 rt_cpus: cfg.rt_cpus.clone(),
2106 delay_rt: cfg.delay_rt,
2107 no_i8042: cfg.no_i8042,
2108 no_rtc: cfg.no_rtc,
2109 host_cpu_topology: cfg.host_cpu_topology,
2110 #[cfg(target_arch = "x86_64")]
2111 force_s2idle: cfg.force_s2idle,
2112 fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
2113 itmt: false,
2114 pvm_fw: None,
2115 pci_config: cfg.pci_config,
2116 #[cfg(target_arch = "x86_64")]
2117 smbios: cfg.smbios.clone(),
2118 dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
2119 #[cfg(target_arch = "x86_64")]
2120 break_linux_pci_config_io: cfg.break_linux_pci_config_io,
2121 boot_cpu: cfg.boot_cpu,
2122 })
2123 }
2124
2125 // Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
2126 enum WindowsIrqChip<V: VcpuArch> {
2127 Userspace(UserspaceIrqChip<V>),
2128 #[cfg(feature = "gvm")]
2129 Gvm(GvmIrqChip),
2130 #[cfg(feature = "whpx")]
2131 WhpxSplit(WhpxSplitIrqChip),
2132 }
2133
2134 impl<V: VcpuArch> WindowsIrqChip<V> {
2135 // Convert our enum to a &mut dyn IrqChipArch
2136     fn as_mut(&mut self) -> &mut dyn IrqChipArch {
2137 match self {
2138 WindowsIrqChip::Userspace(i) => i,
2139 #[cfg(feature = "gvm")]
2140 WindowsIrqChip::Gvm(i) => i,
2141 #[cfg(feature = "whpx")]
2142 WindowsIrqChip::WhpxSplit(i) => i,
2143 }
2144 }
2145 }
2146
2147 /// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
2148 /// need access to it when tracing is enabled.
2149 static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
2150
2151 /// Save the TSC offset for a particular vcpu.
2152 ///
2153 /// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
2154 /// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
2155 /// it can cause clock issues in the guest.
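///
/// A minimal sketch (illustrative only):
///
/// ```ignore
/// // Record the same offset for two vCPUs; the logged standard deviation will be 0.
/// save_vcpu_tsc_offset(0x1000, 0);
/// save_vcpu_tsc_offset(0x1000, 1);
/// ```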
2156 pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
2157 let offsets_copy = {
2158 let mut offsets = TSC_OFFSETS.lock();
2159 // make sure offsets vec is large enough before inserting
2160 let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
2161 offsets.resize(newlen, None);
2162 offsets[vcpu_id] = Some(offset);
2163
2164 offsets.clone()
2165 };
2166
2167 // do statistics on a clone of the offsets so we don't hold up other vcpus at this point
2168 info!(
2169 "TSC offset standard deviation is: {}",
2170 standard_deviation(
2171 &offsets_copy
2172 .iter()
2173 .filter(|x| x.is_some())
2174 .map(|x| x.unwrap() as u128)
2175 .collect::<Vec<u128>>()
2176 )
2177 );
2178 }
2179
2180 /// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
2181 #[cfg(feature = "perfetto")]
2182 pub fn get_vcpu_tsc_offset() -> u64 {
2183 if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
2184 return *offset;
2185 }
2186 0
2187 }
2188
2189 /// Callback that is registered with tracing crate, and will be called by the tracing thread when
2190 /// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for
2191 /// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the
2192 /// host TSC. Redundant snapshots should not be a problem for perfetto.
2193 #[cfg(feature = "perfetto")]
2194 fn set_tsc_clock_snapshot() {
2195 let freq = match devices::tsc::tsc_frequency() {
2196 Err(e) => {
2197 error!(
2198 "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
2199 e
2200 );
2201 return;
2202 }
2203 Ok(freq) => freq,
2204 };
2205
2206     // The offset is the guest-host TSC difference (guest TSC = host TSC + offset).
2207 let offset = get_vcpu_tsc_offset();
2208     // SAFETY: Safe because _rdtsc takes no arguments.
2209 let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
2210 perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
2211 // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
2212 // support floating point multipliers yet. So for now we set the freq in Hz and rely
2213 // on the merge tool to fix it.
2214 perfetto::Clock::new(
2215 perfetto::BuiltinClock::Tsc as u32,
2216 host_tsc.wrapping_add(offset),
2217 )
2218 .set_multiplier(freq as u64),
2219 perfetto::Clock::new(
2220 // The host builtin clock ids are all offset from the guest ids by
2221 // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
2222 // contains both a guest and host clock, we need to offset it before merge.
2223 perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
2224 host_tsc,
2225 )
2226 .set_multiplier(freq as u64),
2227 ));
2228 }
2229
2230 /// Launches run_config for the broker, reading configuration from a TubeTransporter.
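///
/// The broker is expected to have already connected the transporter and to send, in order: the
/// `Config`, the common child startup args, the broker shutdown `Event`, the crash report tube
/// map (when the `crash-report` feature is enabled), and the `BrokerTubes`.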
2231 pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
2232 let tube_transporter =
2233 // SAFETY:
2234         // Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
2235         // the blocking & framing modes are accurate because we create them ourselves in the broker.
2236 unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
2237
2238 let mut tube_data_list = tube_transporter
2239 .read_tubes()
2240 .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
2241
2242 let bootstrap_tube = tube_data_list
2243 .get_tube(TubeToken::Bootstrap)
2244 .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
2245
2246 let mut cfg: Config = bootstrap_tube
2247 .recv::<Config>()
2248 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2249
2250 let startup_args: CommonChildStartupArgs = bootstrap_tube
2251 .recv::<CommonChildStartupArgs>()
2252 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2253 let _child_cleanup = common_child_setup(startup_args).exit_context(
2254 Exit::CommonChildSetupError,
2255 "failed to perform common child setup",
2256 )?;
2257
2258 cfg.broker_shutdown_event = Some(
2259 bootstrap_tube
2260 .recv::<Event>()
2261 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
2262 );
2263 #[cfg(feature = "crash-report")]
2264 let crash_tube_map = bootstrap_tube
2265 .recv::<HashMap<ProcessType, Vec<SendTube>>>()
2266 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2267 #[cfg(feature = "crash-report")]
2268 crash_report::set_crash_tube_map(crash_tube_map);
2269
2270 let BrokerTubes {
2271 vm_evt_wrtube,
2272 vm_evt_rdtube,
2273 } = bootstrap_tube
2274 .recv::<BrokerTubes>()
2275 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2276
2277 run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2278 }
2279
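/// Launches run_config when crosvm is run without the broker: the vm event tube pair is created
/// here rather than inherited, after which the common `run_config_inner` path takes over.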
2280 pub fn run_config(cfg: Config) -> Result<ExitState> {
2281 let _raise_timer_resolution = enable_high_res_timers()
2282 .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
2283
2284 // There is no broker when using run_config(), so the vm_evt tubes need to be created.
2285 let (vm_evt_wrtube, vm_evt_rdtube) =
2286 Tube::directional_pair().context("failed to create vm event tube")?;
2287
2288 run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2289 }
2290
2291 fn create_guest_memory(
2292 components: &VmComponents,
2293 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2294 hypervisor: &impl Hypervisor,
2295 ) -> Result<GuestMemory> {
2296 let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
2297 .exit_context(
2298 Exit::GuestMemoryLayout,
2299 "failed to create guest memory layout",
2300 )?;
2301 GuestMemory::new_with_options(&guest_mem_layout)
2302 .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
2303 }
2304
2305 fn run_config_inner(
2306 cfg: Config,
2307 vm_evt_wrtube: SendTube,
2308 vm_evt_rdtube: RecvTube,
2309 ) -> Result<ExitState> {
2310 product::setup_common_metric_invariants(&cfg);
2311
2312 #[cfg(feature = "perfetto")]
2313 cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);
2314
2315 let components: VmComponents = setup_vm_components(&cfg)?;
2316 let arch_memory_layout = Arch::arch_memory_layout(&components)?;
2317
2318 #[allow(unused_mut)]
2319 let mut hypervisor = cfg
2320 .hypervisor
2321 .or_else(get_default_hypervisor)
2322 .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
2323
2324 #[cfg(feature = "whpx")]
2325 if hypervisor::whpx::Whpx::is_enabled() {
2326 // If WHPX is enabled, no other hypervisor can be used, so just override it
2327 hypervisor = HypervisorKind::Whpx;
2328 }
2329
2330 match hypervisor {
2331 #[cfg(feature = "haxm")]
2332 HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
2333 if hypervisor == HypervisorKind::Haxm {
2334 set_use_ghaxm(false);
2335 }
2336 info!("Creating HAXM ghaxm={}", get_use_ghaxm());
2337 let haxm = Haxm::new()?;
2338 let guest_mem = create_guest_memory(&components, &arch_memory_layout, &haxm)?;
2339 let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
2340 let (ioapic_host_tube, ioapic_device_tube) =
2341 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2342 let irq_chip =
2343 create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
2344 run_vm::<HaxmVcpu, HaxmVm>(
2345 cfg,
2346 components,
2347 &arch_memory_layout,
2348 vm,
2349 WindowsIrqChip::Userspace(irq_chip).as_mut(),
2350 Some(ioapic_host_tube),
2351 vm_evt_wrtube,
2352 vm_evt_rdtube,
2353 )
2354 }
2355 #[cfg(feature = "whpx")]
2356 HypervisorKind::Whpx => {
2357 let apic_emulation_supported =
2358 Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
2359 .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;
2360
2361 let no_smt = cfg.no_smt;
2362
2363 // Default to WhpxSplitIrqChip if it's supported because it's more performant
2364 let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
2365 IrqChipKind::Split
2366 } else {
2367 IrqChipKind::Userspace
2368 });
2369
2370 // Both WHPX irq chips use a userspace IOAPIC
2371 let (ioapic_host_tube, ioapic_device_tube) =
2372 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2373
2374 info!("Creating Whpx");
2375 let whpx = Whpx::new()?;
2376 let guest_mem = create_guest_memory(&components, &arch_memory_layout, &whpx)?;
2377 let vm = create_whpx_vm(
2378 whpx,
2379 guest_mem,
2380 components.vcpu_count,
2381 no_smt,
2382 apic_emulation_supported && irq_chip == IrqChipKind::Split,
2383 cfg.force_calibrated_tsc_leaf,
2384 vm_evt_wrtube
2385 .try_clone()
2386 .expect("could not clone vm_evt_wrtube"),
2387 )?;
2388
2389 let mut irq_chip = match irq_chip {
2390 IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
2391 IrqChipKind::Split => {
2392 if !apic_emulation_supported {
2393 panic!(
2394 "split irqchip specified but your WHPX version does not support \
2395 local apic emulation"
2396 );
2397 }
2398 WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
2399 }
2400 IrqChipKind::Userspace => {
2401 WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
2402 components.vcpu_count,
2403 ioapic_device_tube,
2404 )?)
2405 }
2406 };
2407 run_vm::<WhpxVcpu, WhpxVm>(
2408 cfg,
2409 components,
2410 &arch_memory_layout,
2411 vm,
2412 irq_chip.as_mut(),
2413 Some(ioapic_host_tube),
2414 vm_evt_wrtube,
2415 vm_evt_rdtube,
2416 )
2417 }
2418 #[cfg(feature = "gvm")]
2419 HypervisorKind::Gvm => {
2420 info!("Creating GVM");
2421 let gvm = Gvm::new()?;
2422 let guest_mem = create_guest_memory(&components, &arch_memory_layout, &gvm)?;
2423 let vm = create_gvm_vm(gvm, guest_mem)?;
2424 let ioapic_host_tube;
2425 let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
2426 IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
2427 IrqChipKind::Kernel => {
2428 ioapic_host_tube = None;
2429 WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
2430 }
2431 IrqChipKind::Userspace => {
2432 let (host_tube, ioapic_device_tube) =
2433 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2434 ioapic_host_tube = Some(host_tube);
2435 WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
2436 components.vcpu_count,
2437 ioapic_device_tube,
2438 )?)
2439 }
2440 };
2441 run_vm::<GvmVcpu, GvmVm>(
2442 cfg,
2443 components,
2444 &arch_memory_layout,
2445 vm,
2446 irq_chip.as_mut(),
2447 ioapic_host_tube,
2448 vm_evt_wrtube,
2449 vm_evt_rdtube,
2450 )
2451 }
2452 }
2453 }
2454
2455 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
2456 fn run_vm<Vcpu, V>(
2457 #[allow(unused_mut)] mut cfg: Config,
2458 #[allow(unused_mut)] mut components: VmComponents,
2459 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2460 mut vm: V,
2461 irq_chip: &mut dyn IrqChipArch,
2462 ioapic_host_tube: Option<Tube>,
2463 vm_evt_wrtube: SendTube,
2464 vm_evt_rdtube: RecvTube,
2465 ) -> Result<ExitState>
2466 where
2467 Vcpu: VcpuArch + 'static,
2468 V: VmArch + 'static,
2469 {
2470 let vm_memory_size_mb = components.memory_size / (1024 * 1024);
2471 let mut control_tubes = Vec::new();
2472 let mut irq_control_tubes = Vec::new();
2473 let mut vm_memory_control_tubes = Vec::new();
2474 // Create one control tube per disk.
2475 let mut disk_device_tubes = Vec::new();
2476 let mut disk_host_tubes = Vec::new();
2477 let disk_count = cfg.disks.len();
2478 for _ in 0..disk_count {
2479 let (disk_host_tube, disk_device_tube) =
2480 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2481 disk_host_tubes.push(disk_host_tube);
2482 disk_device_tubes.push(disk_device_tube);
2483 }
2484
2485 if let Some(ioapic_host_tube) = ioapic_host_tube {
2486 irq_control_tubes.push(ioapic_host_tube);
2487 }
2488
2489 // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2490 let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
2491 let (balloon_host_tube, balloon_device_tube) =
2492 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2493 (Some(balloon_host_tube), Some(balloon_device_tube))
2494 } else {
2495 (None, None)
2496 };
2497 // The balloon device also needs a tube to communicate back to the main process to
2498 // handle remapping memory dynamically.
2499 let dynamic_mapping_device_tube = if cfg.balloon {
2500 let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
2501 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2502 vm_memory_control_tubes.push(dynamic_mapping_host_tube);
2503 Some(dynamic_mapping_device_tube)
2504 } else {
2505 None
2506 };
2507
2508 // PvClock gets a tube for handling suspend/resume requests from the main thread.
2509 #[cfg(feature = "pvclock")]
2510 let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
2511 let (host, device) =
2512 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2513 (Some(host), Some(device))
2514 } else {
2515 (None, None)
2516 };
2517
2518 let gralloc = RutabagaGralloc::new(RutabagaGrallocBackendFlags::new())
2519 .exit_context(Exit::CreateGralloc, "failed to create gralloc")?;
2520
2521 let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2522 let mut sys_allocator = SystemAllocator::new(
2523 Arch::get_system_allocator_config(&vm, arch_memory_layout),
2524 pstore_size,
2525 &cfg.mmio_address_ranges,
2526 )
2527 .context("failed to create system allocator")?;
2528
2529 // Allocate the ramoops region first.
2530 let ramoops_region = match &components.pstore {
2531 Some(pstore) => Some(
2532 arch::pstore::create_memory_region(
2533 &mut vm,
2534 sys_allocator.reserved_region().unwrap(),
2535 pstore,
2536 )
2537 .exit_context(
2538 Exit::Pstore,
2539 format!("failed to allocate pstore region {:?}", &components.pstore),
2540 )?,
2541 ),
2542 None => None,
2543 };
2544
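    // The initial balloon size is the amount of memory to reclaim from the guest at boot: total
    // memory minus the requested initial memory, both in bytes. If no initial memory was
    // configured, the balloon starts empty (size 0).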
2545 let init_balloon_size = components
2546 .memory_size
2547 .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
2548 m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
2549 }))
2550 .context("failed to calculate init balloon size")?;
2551
2552 let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
2553 let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);
2554
2555 if tsc_state.core_grouping.size() > 1 {
2556 // Host TSCs are not in sync, log a metric about it.
2557 warn!(
2558 "Host TSCs are not in sync, applying the following mitigations: {:?}",
2559 tsc_sync_mitigations
2560 );
2561 log_descriptor(
2562 MetricEventType::TscCoresOutOfSync,
2563 // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
2564 tsc_state.core_grouping.core_grouping_bitmask() as i64,
2565 );
2566 }
2567
2568 #[cfg(feature = "gpu")]
2569 let gpu_control_tube = cfg
2570 .gpu_vmm_config
2571 .as_mut()
2572 .and_then(|config| config.gpu_control_host_tube.take());
2573 let product_args = product::get_run_control_args(&mut cfg);
2574
2575 // We open these files before lowering the token, as in the future a stricter policy may
2576 // prevent it.
2577 let dt_overlays = cfg
2578 .device_tree_overlay
2579 .iter()
2580 .map(|o| {
2581 Ok(DtbOverlay {
2582 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2583 .with_context(|| {
2584 format!("failed to open device tree overlay {}", o.path.display())
2585 })?,
2586 })
2587 })
2588 .collect::<Result<Vec<DtbOverlay>>>()?;
2589
2590 // Lower the token, locking the main process down to a stricter security policy.
2591 //
2592 // WARNING:
2593 //
2594 // Windows system calls can behave in unusual ways if they happen concurrently to the token
2595 // lowering. For example, access denied can happen if Tube pairs are created in another thread
2596 // (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
2597 // not privileged resources, but can be broken due to the token changing unexpectedly.
2598 //
2599 // We explicitly lower the token here and *then* call run_control to make it clear that any
2600 // resources that require a privileged token should be created on the main thread & passed into
2601 // run_control, to follow the correct order:
2602 // - Privileged resources are created.
2603 // - Token is lowered.
2604 // - Threads are spawned & may create more non-privileged resources (without fear of the token
2605 // changing at an undefined time).
2606 //
2607     // Recommendation: If you find your code doesn't work in run_control because of the sandbox, you
2608 // should split any resource creation to before this token lowering & pass the resources into
2609 // run_control. Don't move the token lowering somewhere else without considering multi-threaded
2610 // effects.
2611 #[cfg(feature = "sandbox")]
2612 if sandbox::is_sandbox_target() {
2613 sandbox::TargetServices::get()
2614 .exit_code_from_err("failed to create sandbox")?
2615 .expect("Could not create sandbox!")
2616 .lower_token();
2617 }
2618
2619 let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;
2620
2621 let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;
2622
2623 let mut initial_audio_session_states: Vec<InitialAudioSessionState> = Vec::new();
2624
2625 let pci_devices = create_devices(
2626 &mut cfg,
2627 vm.get_memory(),
2628 &vm_evt_wrtube,
2629 &mut irq_control_tubes,
2630 &mut vm_memory_control_tubes,
2631 &mut control_tubes,
2632 &mut disk_device_tubes,
2633 &mut initial_audio_session_states,
2634 balloon_device_tube,
2635 #[cfg(feature = "pvclock")]
2636 pvclock_device_tube,
2637 dynamic_mapping_device_tube,
2638 /* inflate_tube= */ None,
2639 init_balloon_size,
2640 tsc_state.frequency,
2641 virtio_snd_state_device_tube,
2642 virtio_snd_device_mute_tube,
2643 )?;
2644
2645 let mut vcpu_ids = Vec::new();
2646
2647     let (vmwdt_host_tube, vmwdt_device_tube) = Tube::pair().context("failed to create tube")?;
2648 let windows = Arch::build_vm::<V, Vcpu>(
2649 components,
2650 arch_memory_layout,
2651 &vm_evt_wrtube,
2652 &mut sys_allocator,
2653 &cfg.serial_parameters,
2654 None,
2655 (cfg.battery_config.as_ref().map(|t| t.type_), None),
2656 vm,
2657 ramoops_region,
2658 pci_devices,
2659 irq_chip,
2660 &mut vcpu_ids,
2661 cfg.dump_device_tree_blob.clone(),
2662 /* debugcon_jail= */ None,
2663 None,
2664 None,
2665 /* guest_suspended_cvar= */ None,
2666 dt_overlays,
2667 cfg.fdt_position,
2668 cfg.no_pmu,
2669 )
2670 .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;
2671
2672 #[cfg(feature = "stats")]
2673 let stats = if cfg.exit_stats {
2674 Some(Arc::new(Mutex::new(StatisticsCollector::new())))
2675 } else {
2676 None
2677 };
2678
2679 run_control(
2680 windows,
2681 sys_allocator,
2682 control_tubes,
2683 irq_control_tubes,
2684 vm_memory_control_tubes,
2685 vm_evt_rdtube,
2686 vm_evt_wrtube,
2687 #[cfg(feature = "gpu")]
2688 gpu_control_tube,
2689 cfg.broker_shutdown_event.take(),
2690 balloon_host_tube,
2691 #[cfg(feature = "pvclock")]
2692 pvclock_host_tube,
2693 disk_host_tubes,
2694 initial_audio_session_states,
2695 gralloc,
2696 #[cfg(feature = "stats")]
2697 stats,
2698 cfg.service_pipe_name,
2699 vm_memory_size_mb,
2700 cfg.host_cpu_topology,
2701 tsc_sync_mitigations,
2702 cfg.force_calibrated_tsc_leaf,
2703 product_args,
2704 match virtio_snd_host_mute_tube {
2705 Some(virtio_snd_host_mute_tube) => vec![virtio_snd_host_mute_tube],
2706 None => vec![],
2707 },
2708 cfg.restore_path,
2709 cfg.socket_path,
2710 cfg.force_s2idle,
2711 cfg.suspended,
2712 )
2713 }
2714
2715 #[cfg(test)]
2716 mod tests {
2717 use tempfile::TempDir;
2718
2719 use super::*;
2720
2721     fn create_config(test_dir: &TempDir) -> Config {
2722 let mut config = Config::default();
2723
2724 let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
2725 OpenOptions::new()
2726 .create_new(true)
2727 .write(true)
2728 .open(&dummy_kernel_path)
2729 .expect("Could not open file!");
2730 config.executable_path = Some(Executable::Kernel(dummy_kernel_path));
2731
2732 config
2733 }
2734
2735 #[test]
2736 #[should_panic(expected = "Did not receive a bios or kernel")]
2737     fn setup_vm_components_panics_when_no_kernel_provided() {
2738 let mut config =
2739 create_config(&TempDir::new().expect("Could not create temporary directory!"));
2740 config.executable_path = None;
2741 let _ = setup_vm_components(&config);
2742 }
2743
2744 #[test]
2745     fn setup_vm_components_stores_memory_in_bytes() {
2746 let tempdir = TempDir::new().expect("Could not create temporary directory!");
2747 let mut config = create_config(&tempdir);
2748 config.memory = Some(1);
2749 let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
2750 assert_eq!(vm_components.memory_size, 1024 * 1024);
2751 }
2752
2753 #[test]
2754     fn setup_vm_components_fails_when_memory_too_large() {
2755 let tempdir = TempDir::new().expect("Could not create temporary directory!");
2756 let mut config = create_config(&tempdir);
2757         // One MiB more than a u64 can hold in bytes
2758 config.memory = Some((u64::MAX / 1024 / 1024) + 1);
2759 setup_vm_components(&config).err().expect("expected error");
2760 }
2761 }
2762