// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// TODO(b:240716507): There is a huge chunk of code which depends on haxm, whpx, or gvm being
// enabled but isn't marked as such. Remove this allow once it is.
#![allow(dead_code, unused_imports, unused_variables, unreachable_code)]

pub(crate) mod control_server;
pub(crate) mod irq_wait;
pub(crate) mod main;
#[cfg(not(feature = "crash-report"))]
mod panic_hook;

mod generic;
use generic as product;
pub(crate) mod run_vcpu;

#[cfg(feature = "whpx")]
use std::arch::x86_64::__cpuid;
#[cfg(feature = "whpx")]
use std::arch::x86_64::__cpuid_count;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::File;
use std::fs::OpenOptions;
use std::io::stdin;
use std::iter;
use std::mem;
use std::os::windows::fs::OpenOptionsExt;
use std::path::PathBuf;
use std::sync::mpsc;
use std::sync::Arc;

#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use acpi_tables::sdt::SDT;
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use arch::CpuConfigArch;
use arch::DtbOverlay;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuArch;
use arch::VirtioDeviceStub;
use arch::VmArch;
use arch::VmComponents;
use arch::VmImage;
use base::enable_high_res_timers;
use base::error;
use base::info;
use base::open_file_or_duplicate;
use base::warn;
use base::AsRawDescriptor;
#[cfg(feature = "gpu")]
use base::BlockingMode;
use base::CloseNotifier;
use base::Event;
use base::EventToken;
use base::EventType;
use base::FlushOnDropTube;
#[cfg(feature = "gpu")]
use base::FramingMode;
use base::FromRawDescriptor;
use base::ProtoTube;
use base::RawDescriptor;
use base::ReadNotifier;
use base::RecvTube;
use base::SendTube;
#[cfg(feature = "gpu")]
use base::StreamChannel;
use base::Terminal;
use base::TriggeredEvent;
use base::Tube;
use base::TubeError;
use base::VmEventType;
use base::WaitContext;
use broker_ipc::common_child_setup;
use broker_ipc::CommonChildStartupArgs;
use control_server::ControlServer;
use crosvm_cli::sys::windows::exit::Exit;
use crosvm_cli::sys::windows::exit::ExitContext;
use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
use crosvm_cli::sys::windows::exit::ExitContextOption;
use devices::create_devices_worker_thread;
use devices::serial_device::SerialHardware;
use devices::serial_device::SerialParameters;
use devices::tsc::get_tsc_sync_mitigations;
use devices::tsc::standard_deviation;
use devices::tsc::TscSyncMitigations;
use devices::virtio;
use devices::virtio::block::DiskOption;
#[cfg(feature = "audio")]
use devices::virtio::snd::common_backend::VirtioSnd;
#[cfg(feature = "audio")]
use devices::virtio::snd::parameters::Parameters as SndParameters;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::gpu::sys::windows::run_gpu_device_worker;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::snd::sys::windows::run_snd_device_worker;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::snd::sys::windows::SndSplitConfig;
#[cfg(feature = "balloon")]
use devices::virtio::BalloonFeatures;
use devices::virtio::Console;
#[cfg(feature = "gpu")]
use devices::virtio::GpuParameters;
use devices::BusDeviceObj;
use devices::BusResumeDevice;
#[cfg(feature = "gvm")]
use devices::GvmIrqChip;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use devices::IrqChip;
use devices::UserspaceIrqChip;
use devices::VcpuRunState;
use devices::VirtioPciDevice;
#[cfg(feature = "whpx")]
use devices::WhpxSplitIrqChip;
#[cfg(feature = "gpu")]
use gpu_display::EventDevice;
#[cfg(feature = "gpu")]
use gpu_display::WindowProcedureThread;
#[cfg(feature = "gpu")]
use gpu_display::WindowProcedureThreadBuilder;
#[cfg(feature = "gvm")]
use hypervisor::gvm::Gvm;
#[cfg(feature = "gvm")]
use hypervisor::gvm::GvmVcpu;
#[cfg(feature = "gvm")]
use hypervisor::gvm::GvmVersion;
#[cfg(feature = "gvm")]
use hypervisor::gvm::GvmVm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::get_use_ghaxm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::set_use_ghaxm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::Haxm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::HaxmVcpu;
#[cfg(feature = "haxm")]
use hypervisor::haxm::HaxmVm;
#[cfg(feature = "whpx")]
use hypervisor::whpx::Whpx;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxFeature;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVcpu;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVm;
use hypervisor::Hypervisor;
#[cfg(feature = "whpx")]
use hypervisor::HypervisorCap;
#[cfg(feature = "whpx")]
use hypervisor::HypervisorX86_64;
use hypervisor::ProtectionType;
use hypervisor::Vm;
use irq_wait::IrqWaitWorker;
use jail::FakeMinijailStub as Minijail;
#[cfg(not(feature = "crash-report"))]
pub(crate) use panic_hook::set_panic_hook;
use product::create_snd_mute_tube_pair;
#[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
use product::create_snd_state_tube;
#[cfg(feature = "pvclock")]
use product::handle_pvclock_request;
use product::merge_session_invariants;
use product::run_ime_thread;
use product::set_package_name;
pub(crate) use product::setup_metrics_reporting;
use product::start_service_ipc_listener;
use product::RunControlArgs;
use product::ServiceVmState;
use product::Token;
use resources::SystemAllocator;
use run_vcpu::run_all_vcpus;
use run_vcpu::VcpuRunMode;
use rutabaga_gfx::RutabagaGralloc;
use rutabaga_gfx::RutabagaGrallocBackendFlags;
use smallvec::SmallVec;
use sync::Mutex;
use tube_transporter::TubeToken;
use tube_transporter::TubeTransporterReader;
use vm_control::api::VmMemoryClient;
#[cfg(feature = "balloon")]
use vm_control::BalloonControlCommand;
#[cfg(feature = "balloon")]
use vm_control::BalloonTube;
use vm_control::DeviceControlCommand;
use vm_control::InitialAudioSessionState;
use vm_control::IrqHandlerRequest;
use vm_control::PvClockCommand;
use vm_control::VcpuControl;
use vm_control::VmMemoryRegionState;
use vm_control::VmMemoryRequest;
use vm_control::VmRequest;
use vm_control::VmResponse;
use vm_control::VmRunMode;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vmm_vhost::Connection;
use vmm_vhost::FrontendReq;
use win_util::ProcessType;
#[cfg(feature = "whpx")]
use x86_64::cpuid::adjust_cpuid;
#[cfg(feature = "whpx")]
use x86_64::cpuid::CpuIdContext;
#[cfg(target_arch = "x86_64")]
use x86_64::X8664arch as Arch;

use crate::crosvm::config::Config;
use crate::crosvm::config::Executable;
use crate::crosvm::config::InputDeviceOption;
#[cfg(any(feature = "gvm", feature = "whpx"))]
use crate::crosvm::config::IrqChipKind;
#[cfg(feature = "gpu")]
use crate::crosvm::config::TouchDeviceOption;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
use crate::crosvm::sys::config::HypervisorKind;
use crate::crosvm::sys::windows::broker::BrokerTubes;
#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::StatisticsCollector;
#[cfg(feature = "gpu")]
pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::get_snd_product_configs;
#[cfg(feature = "gpu")]
pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
use crate::sys::windows::product::log_descriptor;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::num_input_sound_devices;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::num_input_sound_streams;
use crate::sys::windows::product::spawn_anti_tamper_thread;
use crate::sys::windows::product::MetricEventType;

const DEFAULT_GUEST_CID: u64 = 3;

// by default, if enabled, the balloon WS features will use 4 bins.
const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;

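/// Control tubes monitored by the main run loop, tagged by the kind of messages they carry.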
enum TaggedControlTube {
    Vm(FlushOnDropTube),
    Product(product::TaggedControlTube),
}

impl ReadNotifier for TaggedControlTube {
    fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
        match self {
            Self::Vm(tube) => tube.0.get_read_notifier(),
            Self::Product(tube) => tube.get_read_notifier(),
        }
    }
}

impl CloseNotifier for TaggedControlTube {
    fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
        match self {
            Self::Vm(tube) => tube.0.get_close_notifier(),
            Self::Product(tube) => tube.get_close_notifier(),
        }
    }
}

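/// Reason the VM stopped running, returned to the caller once the run loop exits.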
pub enum ExitState {
    Reset,
    Stop,
    Crash,
    #[allow(dead_code)]
    GuestPanic,
    WatchdogReset,
}

type DeviceResult<T = VirtioDeviceStub> = Result<T>;

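/// Creates a vhost-user block frontend attached to the backend on `connection`.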
fn create_vhost_user_block_device(
    cfg: &Config,
    connection: Connection<FrontendReq>,
) -> DeviceResult {
    let dev = virtio::VhostUserFrontend::new(
        virtio::DeviceType::Block,
        virtio::base_features(cfg.protection_type),
        connection,
        None,
        None,
    )
    .exit_context(
        Exit::VhostUserBlockDeviceNew,
        "failed to set up vhost-user block device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

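/// Creates an in-process virtio block device backed by `disk`.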
fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
    let features = virtio::base_features(cfg.protection_type);
    let dev = virtio::BlockAsync::new(
        features,
        disk.open()?,
        disk,
        Some(disk_device_tube),
        None,
        None,
    )
    .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

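/// Creates a vhost-user GPU frontend attached to the backend on `connection`.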
#[cfg(feature = "gpu")]
fn create_vhost_user_gpu_device(
    base_features: u64,
    connection: Connection<FrontendReq>,
) -> DeviceResult {
    let dev = virtio::VhostUserFrontend::new(
        virtio::DeviceType::Gpu,
        base_features,
        connection,
        None,
        None,
    )
    .exit_context(
        Exit::VhostUserGpuDeviceNew,
        "failed to set up vhost-user gpu device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

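/// Creates a vhost-user sound frontend attached to the backend on `connection`.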
#[cfg(feature = "audio")]
fn create_vhost_user_snd_device(
    base_features: u64,
    connection: Connection<FrontendReq>,
) -> DeviceResult {
    let dev = virtio::VhostUserFrontend::new(
        virtio::DeviceType::Sound,
        base_features,
        connection,
        None,
        None,
    )
    .exit_context(
        Exit::VhostUserSndDeviceNew,
        "failed to set up vhost-user snd device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

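/// Creates a virtio multi-touch input device that reads events from `event_pipe`.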
#[cfg(feature = "gpu")]
fn create_multi_touch_device(
    cfg: &Config,
    event_pipe: StreamChannel,
    width: u32,
    height: u32,
    name: Option<&str>,
    idx: u32,
) -> DeviceResult {
    let dev = virtio::input::new_multi_touch(
        idx,
        event_pipe,
        width,
        height,
        name,
        virtio::base_features(cfg.protection_type),
    )
    .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

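/// Creates a virtio mouse input device that reads events from `event_pipe`.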
#[cfg(feature = "gpu")]
fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
    let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
        .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

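/// Creates a vhost-user net frontend attached to the backend on `connection`.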
#[cfg(feature = "slirp")]
fn create_vhost_user_net_device(cfg: &Config, connection: Connection<FrontendReq>) -> DeviceResult {
    let features = virtio::base_features(cfg.protection_type);
    let dev =
        virtio::VhostUserFrontend::new(virtio::DeviceType::Net, features, connection, None, None)
            .exit_context(
            Exit::VhostUserNetDeviceNew,
            "failed to set up vhost-user net device",
        )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

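/// Creates a virtio rng device.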
fn create_rng_device(cfg: &Config) -> DeviceResult {
    let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
        .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

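/// Creates a virtio console device from the given serial parameters.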
fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
    let mut keep_rds = Vec::new();
    let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
    let dev = param
        .create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
        .exit_context(Exit::CreateConsole, "failed to create console device")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

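/// Creates a virtio balloon device using the provided control and memory-mapping tubes.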
#[cfg(feature = "balloon")]
fn create_balloon_device(
    cfg: &Config,
    balloon_device_tube: Tube,
    dynamic_mapping_device_tube: Tube,
    inflate_tube: Option<Tube>,
    init_balloon_size: u64,
) -> DeviceResult {
    let balloon_features =
        (cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
    let dev = virtio::Balloon::new(
        virtio::base_features(cfg.protection_type),
        balloon_device_tube,
        VmMemoryClient::new(dynamic_mapping_device_tube),
        inflate_tube,
        init_balloon_size,
        balloon_features,
        #[cfg(feature = "registered_events")]
        None,
        VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
    )
    .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

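/// Creates a userspace virtio vsock device, falling back to `DEFAULT_GUEST_CID` when no CID is
/// configured.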
fn create_vsock_device(cfg: &Config) -> DeviceResult {
    // We only support a single guest, so we can confidently assign a default
    // CID if one isn't provided. We choose the lowest non-reserved value.
    let dev = virtio::vsock::Vsock::new(
        cfg.vsock
            .as_ref()
            .map(|cfg| cfg.cid)
            .unwrap_or(DEFAULT_GUEST_CID),
        cfg.host_guid.clone(),
        virtio::base_features(cfg.protection_type),
    )
    .exit_context(
        Exit::UserspaceVsockDeviceNew,
        "failed to create userspace vsock device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

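/// Creates the full set of virtio devices requested by `cfg` (block, console, snd, pvclock, rng,
/// net, balloon, vsock, input, and GPU, depending on the enabled features).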
fn create_virtio_devices(
    cfg: &mut Config,
    vm_evt_wrtube: &SendTube,
    #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
    disk_device_tubes: &mut Vec<Tube>,
    initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
    balloon_device_tube: Option<Tube>,
    #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
    dynamic_mapping_device_tube: Option<Tube>,
    inflate_tube: Option<Tube>,
    init_balloon_size: u64,
    tsc_frequency: u64,
    virtio_snd_state_device_tube: Option<Tube>,
    virtio_snd_control_device_tube: Option<Tube>,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    if cfg.block_vhost_user_tube.is_empty() {
        // Disk devices must precede virtio-console devices or the kernel does not boot.
        // TODO(b/171215421): figure out why this ordering is required and fix it.
        for disk in &cfg.disks {
            let disk_device_tube = disk_device_tubes.remove(0);
            devs.push(create_block_device(cfg, disk, disk_device_tube)?);
        }
    } else {
        info!("Starting up vhost user block backends...");
        for _disk in &cfg.disks {
            let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
            let connection = Connection::<FrontendReq>::from(disk_device_tube);
            devs.push(create_vhost_user_block_device(cfg, connection)?);
        }
    }

    for (_, param) in cfg
        .serial_parameters
        .iter()
        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
    {
        let dev = create_console_device(cfg, param)?;
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    {
        let snd_split_configs = std::mem::take(&mut cfg.snd_split_configs);
        for mut snd_split_cfg in snd_split_configs.into_iter() {
            devs.push(create_virtio_snd_device(
                cfg,
                &mut snd_split_cfg,
                control_tubes,
            )?);
            if let Some(vmm_config) = snd_split_cfg.vmm_config {
                let initial_audio_session_state = InitialAudioSessionState {
                    audio_client_guid: vmm_config.audio_client_guid,
                    card_index: vmm_config.card_index,
                };
                initial_audio_session_states.push(initial_audio_session_state);
            }
        }
    }

    #[cfg(feature = "pvclock")]
    if let Some(tube) = pvclock_device_tube {
        product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
    }

    devs.push(create_rng_device(cfg)?);

    #[cfg(feature = "slirp")]
    if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
        let connection = Connection::<FrontendReq>::from(net_vhost_user_tube);
        devs.push(create_vhost_user_net_device(cfg, connection)?);
    }

    #[cfg(feature = "balloon")]
    if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
        (balloon_device_tube, dynamic_mapping_device_tube)
    {
        devs.push(create_balloon_device(
            cfg,
            balloon_device_tube,
            dynamic_mapping_device_tube,
            inflate_tube,
            init_balloon_size,
        )?);
    }

    devs.push(create_vsock_device(cfg)?);

    #[cfg(feature = "gpu")]
    let event_devices = if let Some(InputEventSplitConfig {
        backend_config,
        vmm_config,
    }) = cfg.input_event_split_config.take()
    {
        devs.extend(
            create_virtio_input_event_devices(cfg, vmm_config)
                .context("create input event devices")?,
        );
        backend_config.map(|cfg| cfg.event_devices)
    } else {
        None
    };

    #[cfg(feature = "gpu")]
    if let Some(wndproc_thread_vmm_config) = cfg
        .window_procedure_thread_split_config
        .as_mut()
        .map(|split_cfg| &mut split_cfg.vmm_config)
    {
        product::push_window_procedure_thread_control_tubes(
            control_tubes,
            wndproc_thread_vmm_config,
        );
    }

    #[cfg(feature = "gpu")]
    let mut wndproc_thread = cfg
        .window_procedure_thread_split_config
        .as_mut()
        .and_then(|cfg| cfg.wndproc_thread_builder.take())
        .map(WindowProcedureThreadBuilder::start_thread)
        .transpose()
        .context("Failed to start the window procedure thread.")?;

    #[cfg(feature = "gpu")]
    if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
        devs.push(create_virtio_gpu_device(
            cfg,
            gpu_vmm_config,
            event_devices,
            &mut wndproc_thread,
            control_tubes,
        )?);
    }

    Ok(devs)
}

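/// Creates the VMM side of the virtio input devices (multi-touch, mouse, and keyboard) from the
/// split input event configuration.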
#[cfg(feature = "gpu")]
fn create_virtio_input_event_devices(
    cfg: &Config,
    mut input_event_vmm_config: InputEventVmmConfig,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // Iterate event devices, create the VMM end.
    let mut multi_touch_pipes = input_event_vmm_config
        .multi_touch_pipes
        .drain(..)
        .enumerate();
    for input in &cfg.virtio_input {
        match input {
            InputDeviceOption::SingleTouch { .. } => {
                unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
            }
            InputDeviceOption::MultiTouch {
                width,
                height,
                name,
                ..
            } => {
                let Some((idx, pipe)) = multi_touch_pipes.next() else {
                    break;
                };
                let mut width = *width;
                let mut height = *height;
                if idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                devs.push(create_multi_touch_device(
                    cfg,
                    pipe,
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    idx as u32,
                )?);
            }
            _ => {}
        }
    }
    drop(multi_touch_pipes);

    product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;

    for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
        devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
    }

    let keyboard_pipe = input_event_vmm_config
        .keyboard_pipes
        .pop()
        .expect("at least one keyboard should be in GPU VMM config");
    let dev = virtio::input::new_keyboard(
        /* idx= */ 0,
        keyboard_pipe,
        virtio::base_features(cfg.protection_type),
    )
    .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;

    devs.push(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    });

    Ok(devs)
}

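/// Creates the virtio GPU device, spawning the vhost-user GPU backend worker in this process when
/// a backend config is present.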
#[cfg(feature = "gpu")]
fn create_virtio_gpu_device(
    cfg: &mut Config,
    mut gpu_vmm_config: GpuVmmConfig,
    event_devices: Option<Vec<EventDevice>>,
    wndproc_thread: &mut Option<WindowProcedureThread>,
    #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
) -> DeviceResult<VirtioDeviceStub> {
    let resource_bridges = Vec::<Tube>::new();

    product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);

    // If the GPU backend is passed, start up the vhost-user worker in the main process.
    if let Some(backend_config) = cfg.gpu_backend_config.take() {
        let event_devices = event_devices.ok_or_else(|| {
            anyhow!("event devices are missing when creating virtio-gpu in the current process.")
        })?;
        let wndproc_thread = wndproc_thread
            .take()
            .ok_or_else(|| anyhow!("Window procedure thread is missing."))?;

        std::thread::spawn(move || {
            run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
        });
    }

    // The GPU is always vhost-user, even if running in the main process.
    let gpu_device_tube = gpu_vmm_config
        .main_vhost_user_tube
        .take()
        .expect("GPU VMM vhost-user tube should be set");
    let connection = Connection::<FrontendReq>::from(gpu_device_tube);

    create_vhost_user_gpu_device(virtio::base_features(cfg.protection_type), connection)
        .context("create vhost-user GPU device")
}

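/// Creates a virtio sound device, spawning the vhost-user snd backend worker in this process when
/// a backend config is present.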
#[cfg(feature = "audio")]
fn create_virtio_snd_device(
    cfg: &mut Config,
    snd_split_config: &mut SndSplitConfig,
    #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
) -> DeviceResult<VirtioDeviceStub> {
    let snd_vmm_config = snd_split_config
        .vmm_config
        .as_mut()
        .expect("snd_vmm_config must exist");
    product::push_snd_control_tubes(control_tubes, snd_vmm_config);

    // If the SND backend is passed, start up the vhost-user worker in the main process.
    if let Some(backend_config) = snd_split_config.backend_config.take() {
        std::thread::spawn(move || run_snd_device_worker(backend_config));
    }

    // The SND is always vhost-user, even if running in the main process.
    let snd_device_tube = snd_vmm_config
        .main_vhost_user_tube
        .take()
        .expect("Snd VMM vhost-user tube should be set");
    let connection = Connection::<FrontendReq>::from(snd_device_tube);

    create_vhost_user_snd_device(virtio::base_features(cfg.protection_type), connection)
        .context("create vhost-user SND device")
}

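/// Creates the virtio devices and wraps each one in a `VirtioPciDevice`, wiring up its MSI,
/// shared memory, ioevent, and VM control tubes.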
fn create_devices(
    cfg: &mut Config,
    mem: &GuestMemory,
    exit_evt_wrtube: &SendTube,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<Tube>,
    control_tubes: &mut Vec<TaggedControlTube>,
    disk_device_tubes: &mut Vec<Tube>,
    initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
    balloon_device_tube: Option<Tube>,
    #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
    dynamic_mapping_device_tube: Option<Tube>,
    inflate_tube: Option<Tube>,
    init_balloon_size: u64,
    tsc_frequency: u64,
    virtio_snd_state_device_tube: Option<Tube>,
    virtio_snd_control_device_tube: Option<Tube>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
    let stubs = create_virtio_devices(
        cfg,
        exit_evt_wrtube,
        control_tubes,
        disk_device_tubes,
        initial_audio_session_states,
        balloon_device_tube,
        #[cfg(feature = "pvclock")]
        pvclock_device_tube,
        dynamic_mapping_device_tube,
        inflate_tube,
        init_balloon_size,
        tsc_frequency,
        virtio_snd_state_device_tube,
        virtio_snd_control_device_tube,
    )?;

    let mut pci_devices = Vec::new();

    for stub in stubs {
        let (msi_host_tube, msi_device_tube) =
            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
        irq_control_tubes.push(msi_host_tube);

        let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
            let (host_tube, device_tube) =
                Tube::pair().context("failed to create VVU proxy tube")?;
            vm_memory_control_tubes.push(host_tube);
            Some(device_tube)
        } else {
            None
        };

        let (ioevent_host_tube, ioevent_device_tube) =
            Tube::pair().context("failed to create ioevent tube")?;
        vm_memory_control_tubes.push(ioevent_host_tube);

        let (vm_control_host_tube, vm_control_device_tube) =
            Tube::pair().context("failed to create vm_control tube")?;
        control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
            vm_control_host_tube,
        )));

        let dev = Box::new(
            VirtioPciDevice::new(
                mem.clone(),
                stub.dev,
                msi_device_tube,
                cfg.disable_virtio_intx,
                shared_memory_tube.map(VmMemoryClient::new),
                VmMemoryClient::new(ioevent_device_tube),
                vm_control_device_tube,
            )
            .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
        ) as Box<dyn BusDeviceObj>;
        pci_devices.push((dev, stub.jail));
    }

    Ok(pci_devices)
}

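/// Error type for pvclock handling.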
#[derive(Debug)]
struct PvClockError(String);

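/// Handles a single readable event from the main `WaitContext`, returning `Ok(Some(exit_state))`
/// when the event requires the VM to stop running.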
fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    event: &TriggeredEvent<Token>,
    vm_control_ids_to_remove: &mut Vec<usize>,
    next_control_id: &mut usize,
    service_vm_state: &mut ServiceVmState,
    disk_host_tubes: &[Tube],
    ipc_main_loop_tube: Option<&Tube>,
    #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
    vm_evt_rdtube: &RecvTube,
    control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
    guest_os: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
    virtio_snd_host_mute_tubes: &mut [Tube],
    proto_main_loop_tube: Option<&ProtoTube>,
    anti_tamper_main_thread_tube: &Option<ProtoTube>,
    #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
    memory_size_mb: u64,
    vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
    #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
    run_mode_arc: &VcpuRunMode,
    region_state: &mut VmMemoryRegionState,
    vm_control_server: Option<&mut ControlServer>,
    irq_handler_control: &Tube,
    device_ctrl_tube: &Tube,
    wait_ctx: &WaitContext<Token>,
    force_s2idle: bool,
    vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
    suspended_pvclock_state: &mut Option<hypervisor::ClockState>,
) -> Result<Option<ExitState>> {
    let mut execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
        if let VmRequest::Exit = request {
            return (VmResponse::Ok, Some(VmRunMode::Exiting));
        }
        let vcpu_size = vcpu_boxes.lock().len();
        let resp = request.execute(
            &guest_os.vm,
            disk_host_tubes,
            &[],
            &mut guest_os.pm,
            #[cfg(feature = "gpu")]
            gpu_control_tube,
            #[cfg(not(feature = "gpu"))]
            None,
            None,
            &mut None,
            |msg| {
                kick_all_vcpus(
                    run_mode_arc,
                    vcpu_control_channels,
                    vcpu_boxes,
                    guest_os.irq_chip.as_ref(),
                    #[cfg(feature = "pvclock")]
                    pvclock_host_tube,
                    &guest_os.resume_notify_devices,
                    msg,
                );
            },
            force_s2idle,
            #[cfg(feature = "swap")]
            None,
            device_ctrl_tube,
            vcpu_size,
            irq_handler_control,
            || guest_os.irq_chip.as_ref().snapshot(vcpu_size),
            suspended_pvclock_state,
        );
        (resp, None)
    };

    match event.token {
        Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
            Ok(vm_event) => {
                let exit_state = match vm_event {
                    VmEventType::Exit => {
                        info!("vcpu requested shutdown");
                        Some(ExitState::Stop)
                    }
                    VmEventType::Reset => {
                        info!("vcpu requested reset");
                        Some(ExitState::Reset)
                    }
                    VmEventType::Crash => {
                        info!("vcpu crashed");
                        Some(ExitState::Crash)
                    }
                    VmEventType::Panic(_) => {
                        error!("got pvpanic event. this event is not expected on Windows.");
                        None
                    }
                    VmEventType::WatchdogReset => {
                        info!("vcpu stall detected");
                        Some(ExitState::WatchdogReset)
                    }
                };
                return Ok(exit_state);
            }
            Err(e) => {
                warn!("failed to recv VmEvent: {}", e);
            }
        },
        Token::BrokerShutdown => {
            info!("main loop got broker shutdown event");
            return Ok(Some(ExitState::Stop));
        }
        Token::VmControlServer => {
            let server =
                vm_control_server.expect("control server must exist if this event triggers");
            let client = server.accept();
            let id = *next_control_id;
            *next_control_id += 1;
            wait_ctx
                .add(client.0.get_read_notifier(), Token::VmControl { id })
                .exit_context(
                    Exit::WaitContextAdd,
                    "failed to add trigger to wait context",
                )?;
            wait_ctx
                .add(client.0.get_close_notifier(), Token::VmControl { id })
                .exit_context(
                    Exit::WaitContextAdd,
                    "failed to add trigger to wait context",
                )?;
            control_tubes.insert(id, TaggedControlTube::Vm(client));
        }
        #[allow(clippy::collapsible_match)]
        Token::VmControl { id } => {
            if let Some(tube) = control_tubes.get(&id) {
                #[allow(clippy::single_match)]
                match tube {
                    TaggedControlTube::Product(product_tube) => {
                        product::handle_tagged_control_tube_event(
                            product_tube,
                            virtio_snd_host_mute_tubes,
                            service_vm_state,
                            ipc_main_loop_tube,
                        )
                    }
                    TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
                        Ok(request) => {
                            let mut run_mode_opt = None;
                            let response = match request {
                                VmRequest::HotPlugVfioCommand { device, add } => {
                                    // Suppress warnings.
                                    let _ = (device, add);
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "registered_events")]
                                VmRequest::RegisterListener { socket_addr, event } => {
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "registered_events")]
                                VmRequest::UnregisterListener { socket_addr, event } => {
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "registered_events")]
                                VmRequest::Unregister { socket_addr } => {
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "balloon")]
                                VmRequest::BalloonCommand(cmd) => {
                                    if let Some(balloon_tube) = balloon_tube {
                                        if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
                                        {
                                            if key != id {
                                                unimplemented!("not implemented on Windows");
                                            }
                                            Some(r)
                                        } else {
                                            None
                                        }
                                    } else {
                                        error!("balloon not enabled");
                                        None
                                    }
                                }
                                _ => {
                                    let (resp, run_mode_ret) =
                                        execute_vm_request(request, guest_os);
                                    run_mode_opt = run_mode_ret;
                                    Some(resp)
                                }
                            };

                            if let Some(response) = response {
                                if let Err(e) = tube.0.send(&response) {
                                    error!("failed to send VmResponse: {}", e);
                                }
                            }
                            if let Some(exit_state) =
                                handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
                            {
                                return Ok(Some(exit_state));
                            }
                        }
                        Err(e) => {
                            if let TubeError::Disconnected = e {
                                vm_control_ids_to_remove.push(id);
                            } else {
                                error!("failed to recv VmRequest: {}", e);
                            }
                        }
                    },
                }
            }
        }
        #[cfg(feature = "balloon")]
        Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
            Ok(resp) => {
                for (resp, idx) in resp {
                    if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
                        if let Err(e) = tube.0.send(&resp) {
                            error!("failed to send VmResponse: {}", e);
                        }
                    } else {
                        error!("Bad tube index {}", idx);
                    }
                }
            }
            Err(err) => {
                error!("Error processing balloon tube {:?}", err)
            }
        },
        #[cfg(not(feature = "balloon"))]
        Token::BalloonTube => unreachable!("balloon tube not registered"),
        #[allow(unreachable_patterns)]
        _ => {
            let run_mode_opt = product::handle_received_token(
                &event.token,
                anti_tamper_main_thread_tube,
                #[cfg(feature = "balloon")]
                balloon_tube,
                control_tubes,
                guest_os,
                ipc_main_loop_tube,
                memory_size_mb,
                proto_main_loop_tube,
                #[cfg(feature = "pvclock")]
                pvclock_host_tube,
                run_mode_arc,
                service_vm_state,
                vcpu_boxes,
                virtio_snd_host_mute_tubes,
                execute_vm_request,
            );
            if let Some(exit_state) = handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
            {
                return Ok(Some(exit_state));
            }
        }
    };
    Ok(None)
}

/// Handles a run mode change, if one is pending as a result of a VmRequest.
/// The parameter, run_mode_opt, is the run mode change proposed by the
/// VmRequest's execution.
///
/// Returns the exit state if it changed due to a run mode change, and None
/// otherwise.
fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    run_mode_opt: &Option<VmRunMode>,
    guest_os: &mut RunnableLinuxVm<V, Vcpu>,
) -> Option<ExitState> {
    if let Some(run_mode) = run_mode_opt {
        info!("control socket changed run mode to {}", run_mode);
        match run_mode {
            VmRunMode::Exiting => return Some(ExitState::Stop),
            _ => unreachable!(),
        }
    }
    // No exit state change.
    None
}

/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// No response is sent for this command.
    Exit,
}

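/// Body of the VM memory handler thread: services `VmMemoryRequest`s arriving on the control
/// tubes until an exit is requested on `handler_control`.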
fn vm_memory_handler_thread(
    control_tubes: Vec<Tube>,
    mut vm: impl Vm,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    mut gralloc: RutabagaGralloc,
    handler_control: Tube,
) -> anyhow::Result<()> {
    #[derive(EventToken)]
    enum Token {
        VmControl { id: usize },
        HandlerControl,
    }

    let wait_ctx =
        WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
            .context("failed to build wait context")?;
    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    for (id, socket) in control_tubes.iter() {
        wait_ctx
            .add(socket.get_read_notifier(), Token::VmControl { id: *id })
            .context("failed to add descriptor to wait context")?;
    }

    let mut region_state: VmMemoryRegionState = Default::default();

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break;
                }
            }
        };

        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
                    Ok(request) => match request {
                        VmMemoryHandlerRequest::Exit => break 'wait,
                    },
                    Err(e) => {
                        if let TubeError::Disconnected = e {
                            panic!("vm memory control tube disconnected.");
                        } else {
                            error!("failed to recv VmMemoryHandlerRequest: {}", e);
                        }
                    }
                },

                Token::VmControl { id } => {
                    if let Some(tube) = control_tubes.get(&id) {
                        match tube.recv::<VmMemoryRequest>() {
                            Ok(request) => {
                                let response = request.execute(
                                    &mut vm,
                                    &mut sys_allocator_mutex.lock(),
                                    &mut gralloc,
                                    None,
                                    &mut region_state,
                                );
                                if let Err(e) = tube.send(&response) {
                                    error!("failed to send VmMemoryControlResponse: {}", e);
                                }
                            }
                            Err(e) => {
                                if let TubeError::Disconnected = e {
                                    vm_control_ids_to_remove.push(id);
                                } else {
                                    error!("failed to recv VmMemoryControlRequest: {}", e);
                                }
                            }
                        }
                    }
                }
            }
        }

        remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
        if events
            .iter()
            .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
        {
            error!("vm memory handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}

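/// Creates the VM control server listening at `control_server_path` (non-prod builds only) and
/// registers it with `wait_ctx`.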
fn create_control_server(
    control_server_path: Option<PathBuf>,
    wait_ctx: &WaitContext<Token>,
) -> Result<Option<ControlServer>> {
    #[cfg(not(feature = "prod-build"))]
    {
        if let Some(path) = control_server_path {
            let server =
                ControlServer::new(path.to_str().expect("control socket path must be a string"))
                    .exit_context(
                        Exit::FailedToCreateControlServer,
                        "failed to create control server",
                    )?;
            wait_ctx
                .add(server.client_waiting(), Token::VmControlServer)
                .exit_context(
                    Exit::WaitContextAdd,
                    "failed to add control server to wait context",
                )?;
            return Ok(Some(server));
        }
    }
    Ok::<Option<ControlServer>, anyhow::Error>(None)
}

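/// Runs the main control loop of the VM: starts the IRQ wait, VM memory, and device worker
/// threads, registers the control tubes with the wait context, and services events until the VM
/// exits, returning the final `ExitState`.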
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut guest_os: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, control_tubes: Vec<TaggedControlTube>, irq_control_tubes: Vec<Tube>, vm_memory_control_tubes: Vec<Tube>, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>, broker_shutdown_evt: Option<Event>, balloon_host_tube: Option<Tube>, #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>, disk_host_tubes: Vec<Tube>, initial_audio_session_states: Vec<InitialAudioSessionState>, gralloc: RutabagaGralloc, #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>, service_pipe_name: Option<String>, memory_size_mb: u64, host_cpu_topology: bool, tsc_sync_mitigations: TscSyncMitigations, force_calibrated_tsc_leaf: bool, mut product_args: RunControlArgs, mut virtio_snd_host_mute_tubes: Vec<Tube>, restore_path: Option<PathBuf>, control_server_path: Option<PathBuf>, force_s2idle: bool, suspended: bool, ) -> Result<ExitState>1260 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1261     mut guest_os: RunnableLinuxVm<V, Vcpu>,
1262     sys_allocator: SystemAllocator,
1263     control_tubes: Vec<TaggedControlTube>,
1264     irq_control_tubes: Vec<Tube>,
1265     vm_memory_control_tubes: Vec<Tube>,
1266     vm_evt_rdtube: RecvTube,
1267     vm_evt_wrtube: SendTube,
1268     #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
1269     broker_shutdown_evt: Option<Event>,
1270     balloon_host_tube: Option<Tube>,
1271     #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
1272     disk_host_tubes: Vec<Tube>,
1273     initial_audio_session_states: Vec<InitialAudioSessionState>,
1274     gralloc: RutabagaGralloc,
1275     #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
1276     service_pipe_name: Option<String>,
1277     memory_size_mb: u64,
1278     host_cpu_topology: bool,
1279     tsc_sync_mitigations: TscSyncMitigations,
1280     force_calibrated_tsc_leaf: bool,
1281     mut product_args: RunControlArgs,
1282     mut virtio_snd_host_mute_tubes: Vec<Tube>,
1283     restore_path: Option<PathBuf>,
1284     control_server_path: Option<PathBuf>,
1285     force_s2idle: bool,
1286     suspended: bool,
1287 ) -> Result<ExitState> {
1288     let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
1289         start_service_ipc_listener(service_pipe_name)?;
1290 
1291     let mut service_vm_state = product::create_service_vm_state(memory_size_mb);
1292 
1293     let service_audio_states = product::create_service_audio_states_and_send_to_service(
1294         initial_audio_session_states,
1295         &ipc_main_loop_tube,
1296     )?;
1297 
1298     let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
1299 
1300     let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
1301     let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
1302         Exit::CreateTube,
1303         "failed to create IRQ handler control Tube",
1304     )?;
1305 
1306     // Create a separate thread to wait on IRQ events. This is a natural division
1307     // because IRQ interrupts have no dependencies on other events, and this lets
1308     // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
1309     let irq_join_handle = IrqWaitWorker::start(
1310         irq_handler_control_for_worker,
1311         guest_os
1312             .irq_chip
1313             .try_box_clone()
1314             .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
1315         irq_control_tubes,
1316         sys_allocator_mutex.clone(),
1317     );
1318 
1319     let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
1320     product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
1321     let wait_ctx = WaitContext::build_with(&triggers).exit_context(
1322         Exit::WaitContextAdd,
1323         "failed to add trigger to wait context",
1324     )?;
1325 
1326     #[cfg(feature = "balloon")]
1327     let mut balloon_tube = balloon_host_tube
1328         .map(|tube| -> Result<BalloonTube> {
1329             wait_ctx
1330                 .add(tube.get_read_notifier(), Token::BalloonTube)
1331                 .context("failed to add trigger to wait context")?;
1332             Ok(BalloonTube::new(tube))
1333         })
1334         .transpose()
1335         .context("failed to create balloon tube")?;
1336 
1337     let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
1338     let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
1339         .name("vm_memory_handler_thread".into())
1340         .spawn({
1341             let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
1342             let sys_allocator_mutex = sys_allocator_mutex.clone();
1343             move || {
1344                 vm_memory_handler_thread(
1345                     vm_memory_control_tubes,
1346                     vm,
1347                     sys_allocator_mutex,
1348                     gralloc,
1349                     vm_memory_handler_control_for_thread,
1350                 )
1351             }
1352         })
1353         .unwrap();
1354 
1355     if let Some(evt) = broker_shutdown_evt.as_ref() {
1356         wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
1357             Exit::WaitContextAdd,
1358             "failed to add trigger to wait context",
1359         )?;
1360     }
1361 
1362     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1363     let mut next_control_id = control_tubes.len();
1364     for (id, control_tube) in control_tubes.iter() {
1365         #[allow(clippy::single_match)]
1366         match control_tube {
1367             TaggedControlTube::Product(product_tube) => wait_ctx
1368                 .add(
1369                     product_tube.get_read_notifier(),
1370                     Token::VmControl { id: *id },
1371                 )
1372                 .exit_context(
1373                     Exit::WaitContextAdd,
1374                     "failed to add trigger to wait context",
1375                 )?,
1376             _ => (),
1377         }
1378     }
1379 
1380     let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
1381     guest_os.devices_thread = match create_devices_worker_thread(
1382         guest_os.vm.get_memory().clone(),
1383         guest_os.io_bus.clone(),
1384         guest_os.mmio_bus.clone(),
1385         device_ctrl_resp,
1386     ) {
1387         Ok(join_handle) => Some(join_handle),
1388         Err(e) => {
1389             return Err(anyhow!("Failed to start devices thread: {}", e));
1390         }
1391     };
1392 
1393     let vcpus: Vec<Option<_>> = match guest_os.vcpus.take() {
1394         Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
1395         None => iter::repeat_with(|| None)
1396             .take(guest_os.vcpu_count)
1397             .collect(),
1398     };
1399 
1400     let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);
1401 
1402     let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;
1403 
1404     let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;
1405 
1406     let original_terminal_mode = stdin().set_raw_mode().ok();
1407 
1408     let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
1409     let run_mode_arc = Arc::new(VcpuRunMode::default());
1410 
1411     let run_mode_state = if suspended {
1412         // Sleep devices before creating vcpus.
1413         device_ctrl_tube
1414             .send(&DeviceControlCommand::SleepDevices)
1415             .context("send command to devices control socket")?;
1416         match device_ctrl_tube
1417             .recv()
1418             .context("receive from devices control socket")?
1419         {
1420             VmResponse::Ok => (),
1421             resp => bail!("device sleep failed: {}", resp),
1422         }
1423         run_mode_arc.set_and_notify(VmRunMode::Suspending);
1424         VmRunMode::Suspending
1425     } else {
1426         VmRunMode::Running
1427     };
1428 
1429     // If we are restoring from a snapshot, then start suspended.
1430     if restore_path.is_some() {
1431         run_mode_arc.set_and_notify(VmRunMode::Suspending);
1432     }
1433 
1434     let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
1435         vcpus,
1436         vcpu_boxes.clone(),
1437         &guest_os,
1438         &exit_evt,
1439         &vm_evt_wrtube,
1440         #[cfg(feature = "stats")]
1441         &stats,
1442         host_cpu_topology,
1443         run_mode_arc.clone(),
1444         tsc_sync_mitigations,
1445         force_calibrated_tsc_leaf,
1446     )?;
1447 
1448     // See comment on `VmRequest::execute`.
1449     let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
1450 
1451     // Restore VM (if applicable).
1452     if let Some(path) = restore_path {
1453         vm_control::do_restore(
1454             &path,
1455             |msg| {
1456                 kick_all_vcpus(
1457                     run_mode_arc.as_ref(),
1458                     &vcpu_control_channels,
1459                     vcpu_boxes.as_ref(),
1460                     guest_os.irq_chip.as_ref(),
1461                     #[cfg(feature = "pvclock")]
1462                     &pvclock_host_tube,
1463                     &guest_os.resume_notify_devices,
1464                     msg,
1465                 )
1466             },
1467             |msg, index| {
1468                 kick_vcpu(
1469                     run_mode_arc.as_ref(),
1470                     &vcpu_control_channels,
1471                     vcpu_boxes.as_ref(),
1472                     guest_os.irq_chip.as_ref(),
1473                     index,
1474                     msg,
1475                 )
1476             },
1477             &irq_handler_control,
1478             &device_ctrl_tube,
1479             guest_os.vcpu_count,
1480             |image| {
1481                 guest_os
1482                     .irq_chip
1483                     .try_box_clone()?
1484                     .restore(image, guest_os.vcpu_count)
1485             },
1486             /* require_encrypted= */ false,
1487             &mut suspended_pvclock_state,
1488             &guest_os.vm,
1489         )?;
1490         // Allow the vCPUs to start for real.
1491         kick_all_vcpus(
1492             run_mode_arc.as_ref(),
1493             &vcpu_control_channels,
1494             vcpu_boxes.as_ref(),
1495             guest_os.irq_chip.as_ref(),
1496             #[cfg(feature = "pvclock")]
1497             &pvclock_host_tube,
1498             &guest_os.resume_notify_devices,
1499             // Other platforms (unix) have multiple modes they could start in (e.g. starting for
1500             // guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
1501             // to enter that mode here rather than VmRunMode::Running.
1502             VcpuControl::RunState(run_mode_state),
1503         );
1504     }
1505 
1506     let mut exit_state = ExitState::Stop;
1507     let mut region_state: VmMemoryRegionState = Default::default();
1508 
1509     'poll: loop {
1510         let events = {
1511             match wait_ctx.wait() {
1512                 Ok(v) => v,
1513                 Err(e) => {
1514                     error!("failed to wait: {}", e);
1515                     break;
1516                 }
1517             }
1518         };
1519 
1520         let mut vm_control_ids_to_remove = Vec::new();
1521         for event in events.iter().filter(|e| e.is_readable) {
1522             let state = handle_readable_event(
1523                 event,
1524                 &mut vm_control_ids_to_remove,
1525                 &mut next_control_id,
1526                 &mut service_vm_state,
1527                 disk_host_tubes.as_slice(),
1528                 ipc_main_loop_tube.as_ref(),
1529                 #[cfg(feature = "gpu")]
1530                 gpu_control_tube.as_ref(),
1531                 &vm_evt_rdtube,
1532                 &mut control_tubes,
1533                 &mut guest_os,
1534                 &sys_allocator_mutex,
1535                 &mut virtio_snd_host_mute_tubes,
1536                 proto_main_loop_tube.as_ref(),
1537                 &anti_tamper_main_thread_tube,
1538                 #[cfg(feature = "balloon")]
1539                 balloon_tube.as_mut(),
1540                 memory_size_mb,
1541                 vcpu_boxes.as_ref(),
1542                 #[cfg(feature = "pvclock")]
1543                 &pvclock_host_tube,
1544                 run_mode_arc.as_ref(),
1545                 &mut region_state,
1546                 vm_control_server.as_mut(),
1547                 &irq_handler_control,
1548                 &device_ctrl_tube,
1549                 &wait_ctx,
1550                 force_s2idle,
1551                 &vcpu_control_channels,
1552                 &mut suspended_pvclock_state,
1553             )?;
1554             if let Some(state) = state {
1555                 exit_state = state;
1556                 break 'poll;
1557             }
1558         }
1559 
1560         remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1561     }
1562 
1563     info!("run_control poll loop completed, forcing vCPUs to exit...");
1564 
1565     // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
1566     run_mode_arc.set_and_notify(VmRunMode::Exiting);
1567 
1568     // Force all vcpus to exit from the hypervisor
1569     for vcpu in vcpu_boxes.lock().iter() {
1570         vcpu.set_immediate_exit(true);
1571     }
1572 
1573     let mut res = Ok(exit_state);
1574     guest_os.irq_chip.kick_halted_vcpus();
1575     let _ = exit_evt.signal();
1576 
1577     if guest_os.devices_thread.is_some() {
1578         if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
1579             error!("failed to stop device control loop: {}", e);
1580         };
1581         if let Some(thread) = guest_os.devices_thread.take() {
1582             if let Err(e) = thread.join() {
1583                 error!("failed to exit devices thread: {:?}", e);
1584             }
1585         }
1586     }
1587 
1588     // Shut down the VM memory handler thread.
1589     if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
1590         error!(
1591             "failed to request exit from VM memory handler thread: {}",
1592             e
1593         );
1594     }
1595     if let Err(e) = vm_memory_handler_thread_join_handle.join() {
1596         error!("failed to exit VM Memory handler thread: {:?}", e);
1597     }
1598 
1599     // Shut down the IRQ handler thread.
1600     if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
1601         error!("failed to request exit from IRQ handler thread: {}", e);
1602     }
1603 
1604     // Make sure any child threads have ended by sending the Exit vm event (possibly again) so
1605     // that their run loops are aborted.
1606     let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
1607     for (i, thread) in vcpu_threads.into_iter().enumerate() {
1608         // Wait until all the threads exit, so that the guest_os.vm Arc reference count drops to 1.
1609         // Otherwise, we would leak memory if we force-killed the thread with terminate.
1610         match thread.join() {
1611             Ok(Err(e)) => {
1612                 error!("vcpu thread {} exited with an error: {}", i, e);
1613                 res = Err(e);
1614             }
1615             Ok(_) => {}
1616             Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
1617         }
1618     }
1619 
1620     info!("vCPU threads have exited.");
1621 
1622     if let Some(ime) = ime_thread {
1623         match ime.join() {
1624             Ok(Err(e)) => {
1625                 error!("ime thread exited with an error: {}", e);
1626                 if res.is_ok() {
1627                     // Prioritize earlier errors, but return this error if no other error has
1628                     // been recorded; either way it has been logged above.
1629                     res = Err(e)
1630                 }
1631             }
1632             Ok(_) => {}
1633             Err(e) => error!("ime thread panicked: {:?}", e),
1634         }
1635     }
1636     info!("IME thread has exited.");
1637 
1638     // This cancels all the outstanding and any future blocking operations.
1639     // TODO(b/196911556): Shut down the executor for a cleaner shutdown. Given that we are using a
1640     // global executor, for a cleaner shutdown we have to call disarm so that all incoming requests
1641     // are run and cancelled. If we call shutdown, all blocking threads will go away, incoming
1642     // operations won't be scheduled to run, and they will be dropped, leading to a panic. The ideal
1643     // place to call shutdown is when we drop a non-global executor.
1644     cros_async::unblock_disarm();
1645     info!("blocking async pool has shut down.");
1646 
1647     let _ = irq_join_handle.join();
1648     info!("IrqWaitWorker has shut down.");
1649 
1650     #[cfg(feature = "stats")]
1651     if let Some(stats) = stats {
1652         println!("Statistics Collected:\n{}", stats.lock());
1653         println!("Statistics JSON:\n{}", stats.lock().json());
1654     }
1655 
1656     if let Some(mode) = original_terminal_mode {
1657         if let Err(e) = stdin().restore_mode(mode) {
1658             warn!("failed to restore terminal mode: {}", e);
1659         }
1660     }
1661 
1662     // Explicitly drop the VM structure here to allow the devices to clean up before the
1663     // control tubes are closed when this function exits.
1664     mem::drop(guest_os);
1665 
1666     info!("guest_os dropped, run_control is done.");
1667 
1668     res
1669 }
1670 
1671 /// Remove Tubes that have been closed from the WaitContext.
1672 fn remove_closed_tubes<T, U>(
1673     wait_ctx: &WaitContext<T>,
1674     tubes: &mut BTreeMap<usize, U>,
1675     mut tube_ids_to_remove: Vec<usize>,
1676 ) -> anyhow::Result<()>
1677 where
1678     T: EventToken,
1679     U: ReadNotifier + CloseNotifier,
1680 {
1681     tube_ids_to_remove.dedup();
1682     for id in tube_ids_to_remove {
1683         if let Some(socket) = tubes.remove(&id) {
1684             wait_ctx
1685                 .delete(socket.get_read_notifier())
1686                 .context("failed to remove descriptor from wait context")?;
1687 
1688             // There may be a close notifier registered for this Tube. If there isn't one
1689             // registered, we just ignore the error.
1690             let _ = wait_ctx.delete(socket.get_close_notifier());
1691         }
1692     }
1693     Ok(())
1694 }
1695 
1696 /// Sends a message to all VCPUs.
1697 fn kick_all_vcpus(
1698     run_mode: &VcpuRunMode,
1699     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1700     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1701     irq_chip: &dyn IrqChipArch,
1702     #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1703     resume_notify_devices: &[Arc<Mutex<dyn BusResumeDevice>>],
1704     msg: VcpuControl,
1705 ) {
1706     // On Windows, we handle run mode switching directly rather than delegating to the VCPU thread
1707     // like unix does.
1708     match &msg {
1709         VcpuControl::RunState(VmRunMode::Suspending) => {
1710             suspend_all_vcpus(
1711                 run_mode,
1712                 vcpu_boxes,
1713                 irq_chip,
1714                 #[cfg(feature = "pvclock")]
1715                 pvclock_host_tube,
1716             );
1717             return;
1718         }
1719         VcpuControl::RunState(VmRunMode::Running) => {
1720             for device in resume_notify_devices {
1721                 device.lock().resume_imminent();
1722             }
1723             resume_all_vcpus(
1724                 run_mode,
1725                 vcpu_boxes,
1726                 irq_chip,
1727                 #[cfg(feature = "pvclock")]
1728                 pvclock_host_tube,
1729             );
1730             return;
1731         }
1732         _ => (),
1733     }
1734 
1735     // For non-RunState commands, we dispatch just like unix would.
1736     for vcpu in vcpu_control_channels {
1737         if let Err(e) = vcpu.send(msg.clone()) {
1738             error!("failed to send VcpuControl message: {}", e);
1739         }
1740     }
1741 
1742     // Now that we've sent a message, we need VCPUs to exit so they can process it.
1743     for vcpu in vcpu_boxes.lock().iter() {
1744         vcpu.set_immediate_exit(true);
1745     }
1746     irq_chip.kick_halted_vcpus();
1747 
1748     // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1749     // the control message.
1750     let current_run_mode = run_mode.get_mode();
1751     if current_run_mode != VmRunMode::Running {
1752         run_mode.set_and_notify(current_run_mode);
1753     }
1754 }
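// For reference, the restore path in `run_control` above ends with
// `kick_all_vcpus(..., VcpuControl::RunState(run_mode_state))`, which takes the suspend/resume
// fast path above rather than queuing a per-vCPU control message.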
1755 
1756 /// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a single
1757 /// VCPU.
1758 fn kick_vcpu(
1759     run_mode: &VcpuRunMode,
1760     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1761     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1762     irq_chip: &dyn IrqChipArch,
1763     index: usize,
1764     msg: VcpuControl,
1765 ) {
1766     assert!(
1767         !matches!(msg, VcpuControl::RunState(_)),
1768         "Windows does not support RunState changes on a per VCPU basis"
1769     );
1770 
1771     let vcpu = vcpu_control_channels
1772         .get(index)
1773         .expect("invalid vcpu index specified");
1774     if let Err(e) = vcpu.send(msg) {
1775         error!("failed to send VcpuControl message: {}", e);
1776     }
1777 
1778     // Now that we've sent a message, we need the VCPU to exit so it can
1779     // process the message.
1780     vcpu_boxes
1781         .lock()
1782         .get(index)
1783         .expect("invalid vcpu index specified")
1784         .set_immediate_exit(true);
1785     irq_chip.kick_halted_vcpus();
1786 
1787     // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1788     // the control message. (Technically this wakes all VCPUs, but those without messages will go
1789     // back to sleep.)
1790     let current_run_mode = run_mode.get_mode();
1791     if current_run_mode != VmRunMode::Running {
1792         run_mode.set_and_notify(current_run_mode);
1793     }
1794 }
1795 
1796 /// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
1797 /// though devices on the host will continue to run.
1798 pub(crate) fn suspend_all_vcpus(
1799     run_mode: &VcpuRunMode,
1800     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1801     irq_chip: &dyn IrqChipArch,
1802     #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1803 ) {
1804     // VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
1805     // they may re-enter the VM.
1806     run_mode.set_and_notify(VmRunMode::Suspending);
1807 
1808     // Force all vcpus to exit from the hypervisor
1809     for vcpu in vcpu_boxes.lock().iter() {
1810         vcpu.set_immediate_exit(true);
1811     }
1812     irq_chip.kick_halted_vcpus();
1813 
1814     #[cfg(feature = "pvclock")]
1815     handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
1816         .unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
1817 }
1818 
1819 /// Resumes all VCPUs.
1820 pub(crate) fn resume_all_vcpus(
1821     run_mode: &VcpuRunMode,
1822     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1823     irq_chip: &dyn IrqChipArch,
1824     #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1825 ) {
1826     #[cfg(feature = "pvclock")]
1827     handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
1828         .unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));
1829 
1830     // Make sure any immediate exit bits are disabled
1831     for vcpu in vcpu_boxes.lock().iter() {
1832         vcpu.set_immediate_exit(false);
1833     }
1834 
1835     run_mode.set_and_notify(VmRunMode::Running);
1836 }
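// Note that this mirrors `suspend_all_vcpus` in reverse: pvclock is resumed first, the
// immediate-exit bits are cleared, and only then is `VmRunMode::Running` broadcast so the vCPU
// threads re-enter the guest.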
1837 
1838 #[cfg(feature = "gvm")]
1839 const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
1840     major: 1,
1841     minor: 4,
1842     patch: 1,
1843 };
1844 
1845 #[cfg(feature = "gvm")]
1846 fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
1847     match gvm.get_full_version() {
1848         Ok(version) => {
1849             if version < GVM_MINIMUM_VERSION {
1850                 error!(
1851                     "GVM version {} is below minimum version {}",
1852                     version, GVM_MINIMUM_VERSION
1853                 );
1854                 return Err(base::Error::new(libc::ENXIO).into());
1855             } else {
1856                 info!("Using GVM version {}.", version)
1857             }
1858         }
1859         Err(e) => {
1860             error!("unable to determine gvm version: {}", e);
1861             return Err(base::Error::new(libc::ENXIO).into());
1862         }
1863     }
1864     let vm = GvmVm::new(&gvm, mem)?;
1865     Ok(vm)
1866 }
1867 
1868 #[cfg(feature = "haxm")]
1869 fn create_haxm_vm(
1870     haxm: Haxm,
1871     mem: GuestMemory,
1872     kernel_log_file: &Option<String>,
1873 ) -> Result<HaxmVm> {
1874     let vm = HaxmVm::new(&haxm, mem)?;
1875     if let Some(path) = kernel_log_file {
1876         use hypervisor::haxm::HAX_CAP_VM_LOG;
1877         if vm.check_raw_capability(HAX_CAP_VM_LOG) {
1878             match vm.register_log_file(path) {
1879                 Ok(_) => {}
1880                 Err(e) => match e.errno() {
1881                     libc::E2BIG => {
1882                         error!(
1883                             "kernel_log_file path is too long, kernel log file will not be written"
1884                         );
1885                     }
1886                     _ => return Err(e.into()),
1887                 },
1888             }
1889         } else {
1890             warn!(
1891                 "kernel_log_file specified but this version of HAXM does not support kernel log \
1892                   files"
1893             );
1894         }
1895     }
1896     Ok(vm)
1897 }
1898 
1899 #[cfg(feature = "whpx")]
1900 #[cfg(target_arch = "x86_64")]
1901 fn create_whpx_vm(
1902     whpx: Whpx,
1903     mem: GuestMemory,
1904     cpu_count: usize,
1905     no_smt: bool,
1906     apic_emulation: bool,
1907     force_calibrated_tsc_leaf: bool,
1908     vm_evt_wrtube: SendTube,
1909 ) -> Result<WhpxVm> {
1910     let cpu_config = hypervisor::CpuConfigX86_64::new(
1911         force_calibrated_tsc_leaf,
1912         false, /* host_cpu_topology */
1913         false, /* enable_hwp */
1914         no_smt,
1915         false, /* itmt */
1916         None,  /* hybrid_type */
1917     );
1918 
1919     // context for non-cpu-specific cpuid results
1920     let ctx = CpuIdContext::new(
1921         0,
1922         cpu_count,
1923         None,
1924         cpu_config,
1925         whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
1926         __cpuid_count,
1927         __cpuid,
1928     );
1929 
1930     // Get all cpuid entries that we should pre-set
1931     let mut cpuid = whpx.get_supported_cpuid()?;
1932 
1933     // Adjust them for crosvm
1934     for entry in cpuid.cpu_id_entries.iter_mut() {
1935         adjust_cpuid(entry, &ctx);
1936     }
1937 
1938     let vm = WhpxVm::new(
1939         &whpx,
1940         cpu_count,
1941         mem,
1942         cpuid,
1943         apic_emulation,
1944         Some(vm_evt_wrtube),
1945     )
1946     .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
1947 
1948     Ok(vm)
1949 }
1950 
1951 #[cfg(feature = "gvm")]
1952 fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
1953     info!("Creating GVM irqchip");
1954     let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
1955     Ok(irq_chip)
1956 }
1957 
1958 #[cfg(feature = "whpx")]
1959 #[cfg(target_arch = "x86_64")]
1960 fn create_whpx_split_irq_chip(
1961     vm: &WhpxVm,
1962     ioapic_device_tube: Tube,
1963 ) -> base::Result<WhpxSplitIrqChip> {
1964     info!("Creating WHPX split irqchip");
1965     WhpxSplitIrqChip::new(
1966         vm.try_clone()?,
1967         ioapic_device_tube,
1968         None, // ioapic_pins
1969     )
1970 }
1971 
1972 fn create_userspace_irq_chip<Vcpu>(
1973     vcpu_count: usize,
1974     ioapic_device_tube: Tube,
1975 ) -> base::Result<UserspaceIrqChip<Vcpu>>
1976 where
1977     Vcpu: VcpuArch + 'static,
1978 {
1979     info!("Creating userspace irqchip");
1980     let irq_chip =
1981         UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
1982     Ok(irq_chip)
1983 }
1984 
1985 pub fn get_default_hypervisor() -> Option<HypervisorKind> {
1986     // The ordering here matters: hypervisors are tried from most preferred to least preferred.
1987     #[cfg(feature = "whpx")]
1988     match hypervisor::whpx::Whpx::is_enabled() {
1989         true => return Some(HypervisorKind::Whpx),
1990         false => warn!("Whpx not enabled."),
1991     };
1992 
1993     #[cfg(feature = "haxm")]
1994     match Haxm::new() {
1995         Ok(_) => return Some(HypervisorKind::Ghaxm),
1996         Err(e) => warn!("Cannot initialize HAXM: {}", e),
1997     };
1998 
1999     #[cfg(feature = "gvm")]
2000     // Make sure Gvm device can be opened before selecting it.
2001     match Gvm::new() {
2002         Ok(_) => return Some(HypervisorKind::Gvm),
2003         Err(e) => warn!("Cannot initialize GVM: {}", e),
2004     };
2005 
2006     None
2007 }
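// `run_config_inner` below only falls back to this when `cfg.hypervisor` is not explicitly set,
// via `cfg.hypervisor.or_else(get_default_hypervisor)`.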
2008 
2009 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
2010     let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
2011         Some(
2012             File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
2013                 format!("failed to open initrd {}", initrd_path.display())
2014             })?,
2015         )
2016     } else {
2017         None
2018     };
2019 
2020     let vm_image = match cfg.executable_path {
2021         Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
2022             File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
2023                 format!("failed to open kernel image {}", kernel_path.display(),)
2024             })?,
2025         ),
2026         Some(Executable::Bios(ref bios_path)) => {
2027             VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
2028                 format!("failed to open bios {}", bios_path.display())
2029             })?)
2030         }
2031         _ => panic!("Did not receive a bios or kernel, should be impossible."),
2032     };
2033 
2034     let swiotlb = if let Some(size) = cfg.swiotlb {
2035         Some(
2036             size.checked_mul(1024 * 1024)
2037                 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
2038         )
2039     } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
2040         None
2041     } else {
2042         Some(64 * 1024 * 1024)
2043     };
2044 
2045     let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
2046     {
2047         (
2048             Some(
2049                 open_file_or_duplicate(
2050                     &pflash_parameters.path,
2051                     OpenOptions::new().read(true).write(true),
2052                 )
2053                 .with_context(|| {
2054                     format!("failed to open pflash {}", pflash_parameters.path.display())
2055                 })?,
2056             ),
2057             pflash_parameters.block_size,
2058         )
2059     } else {
2060         (None, 0)
2061     };
2062 
2063     Ok(VmComponents {
2064         memory_size: cfg
2065             .memory
2066             .unwrap_or(256)
2067             .checked_mul(1024 * 1024)
2068             .ok_or_else(|| anyhow!("requested memory size too large"))?,
2069         swiotlb,
2070         vcpu_count: cfg.vcpu_count.unwrap_or(1),
2071         fw_cfg_enable: false,
2072         bootorder_fw_cfg_blob: Vec::new(),
2073         vcpu_affinity: cfg.vcpu_affinity.clone(),
2074         cpu_clusters: cfg.cpu_clusters.clone(),
2075         cpu_capacity: cfg.cpu_capacity.clone(),
2076         no_smt: cfg.no_smt,
2077         hugepages: cfg.hugepages,
2078         hv_cfg: hypervisor::Config {
2079             protection_type: cfg.protection_type,
2080         },
2081         vm_image,
2082         android_fstab: cfg
2083             .android_fstab
2084             .as_ref()
2085             .map(|x| {
2086                 File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
2087                     format!("failed to open android fstab file {}", x.display())
2088                 })
2089             })
2090             .map_or(Ok(None), |v| v.map(Some))?,
2091         pstore: cfg.pstore.clone(),
2092         pflash_block_size,
2093         pflash_image,
2094         initrd_image,
2095         extra_kernel_params: cfg.params.clone(),
2096         acpi_sdts: cfg
2097             .acpi_tables
2098             .iter()
2099             .map(|path| {
2100                 SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
2101                     format!("failed to open ACPI file {}", path.display())
2102                 })
2103             })
2104             .collect::<Result<Vec<SDT>>>()?,
2105         rt_cpus: cfg.rt_cpus.clone(),
2106         delay_rt: cfg.delay_rt,
2107         no_i8042: cfg.no_i8042,
2108         no_rtc: cfg.no_rtc,
2109         host_cpu_topology: cfg.host_cpu_topology,
2110         #[cfg(target_arch = "x86_64")]
2111         force_s2idle: cfg.force_s2idle,
2112         fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
2113         itmt: false,
2114         pvm_fw: None,
2115         pci_config: cfg.pci_config,
2116         #[cfg(target_arch = "x86_64")]
2117         smbios: cfg.smbios.clone(),
2118         dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
2119         #[cfg(target_arch = "x86_64")]
2120         break_linux_pci_config_io: cfg.break_linux_pci_config_io,
2121         boot_cpu: cfg.boot_cpu,
2122     })
2123 }
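// Note: `cfg.memory` and `cfg.swiotlb` are interpreted as MiB here; the `checked_mul(1024 * 1024)`
// calls above convert them to bytes (see the `setup_vm_components_stores_memory_in_bytes` test at
// the bottom of this file).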
2124 
2125 // Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
2126 enum WindowsIrqChip<V: VcpuArch> {
2127     Userspace(UserspaceIrqChip<V>),
2128     #[cfg(feature = "gvm")]
2129     Gvm(GvmIrqChip),
2130     #[cfg(feature = "whpx")]
2131     WhpxSplit(WhpxSplitIrqChip),
2132 }
2133 
2134 impl<V: VcpuArch> WindowsIrqChip<V> {
2135     // Convert our enum to a &mut dyn IrqChipArch
2136     fn as_mut(&mut self) -> &mut dyn IrqChipArch {
2137         match self {
2138             WindowsIrqChip::Userspace(i) => i,
2139             #[cfg(feature = "gvm")]
2140             WindowsIrqChip::Gvm(i) => i,
2141             #[cfg(feature = "whpx")]
2142             WindowsIrqChip::WhpxSplit(i) => i,
2143         }
2144     }
2145 }
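// `run_config_inner` constructs one `WindowsIrqChip` variant per hypervisor backend and passes
// `irq_chip.as_mut()` straight into `run_vm`, so the concrete chip type never has to escape this
// module.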
2146 
2147 /// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
2148 /// need access to it when tracing is enabled.
2149 static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
2150 
2151 /// Save the TSC offset for a particular vcpu.
2152 ///
2153 /// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
2154 /// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
2155 /// it can cause clock issues in the guest.
2156 pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
2157     let offsets_copy = {
2158         let mut offsets = TSC_OFFSETS.lock();
2159         // make sure offsets vec is large enough before inserting
2160         let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
2161         offsets.resize(newlen, None);
2162         offsets[vcpu_id] = Some(offset);
2163 
2164         offsets.clone()
2165     };
2166 
2167     // do statistics on a clone of the offsets so we don't hold up other vcpus at this point
2168     info!(
2169         "TSC offset standard deviation is: {}",
2170         standard_deviation(
2171             &offsets_copy
2172                 .iter()
2173                 .filter(|x| x.is_some())
2174                 .map(|x| x.unwrap() as u128)
2175                 .collect::<Vec<u128>>()
2176         )
2177     );
2178 }
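// For example, if every vCPU records the same offset, the logged standard deviation is 0; any
// spread between vCPUs shows up here as a non-zero value and may indicate guest clock issues.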
2179 
2180 /// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
2181 #[cfg(feature = "perfetto")]
2182 pub fn get_vcpu_tsc_offset() -> u64 {
2183     if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
2184         return *offset;
2185     }
2186     0
2187 }
2188 
2189 /// Callback that is registered with tracing crate, and will be called by the tracing thread when
2190 /// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for
2191 /// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the
2192 /// host TSC. Redundant snapshots should not be a problem for perfetto.
2193 #[cfg(feature = "perfetto")]
2194 fn set_tsc_clock_snapshot() {
2195     let freq = match devices::tsc::tsc_frequency() {
2196         Err(e) => {
2197             error!(
2198                 "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
2199                 e
2200             );
2201             return;
2202         }
2203         Ok(freq) => freq,
2204     };
2205 
2206     // The offset is the guest-host TSC delta (guest TSC = host TSC + offset).
2207     let offset = get_vcpu_tsc_offset();
2208     // Safe because _rdtsc takes no arguments.
2209     let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
2210     perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
2211         // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
2212         // support floating point multipliers yet. So for now we set the freq in Hz and rely
2213         // on the merge tool to fix it.
2214         perfetto::Clock::new(
2215             perfetto::BuiltinClock::Tsc as u32,
2216             host_tsc.wrapping_add(offset),
2217         )
2218         .set_multiplier(freq as u64),
2219         perfetto::Clock::new(
2220             // The host builtin clock ids are all offset from the guest ids by
2221             // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
2222             // contains both a guest and host clock, we need to offset it before merge.
2223             perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
2224             host_tsc,
2225         )
2226         .set_multiplier(freq as u64),
2227     ));
2228 }
2229 
2230 /// Launches run_config for the broker, reading configuration from a TubeTransporter.
2231 pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
2232     let tube_transporter =
2233         // SAFETY:
2234         // Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
2235         // the blocking & framing modes are accurate because we create them ourselves in the broker.
2236         unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
2237 
2238     let mut tube_data_list = tube_transporter
2239         .read_tubes()
2240         .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
2241 
2242     let bootstrap_tube = tube_data_list
2243         .get_tube(TubeToken::Bootstrap)
2244         .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
2245 
2246     let mut cfg: Config = bootstrap_tube
2247         .recv::<Config>()
2248         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2249 
2250     let startup_args: CommonChildStartupArgs = bootstrap_tube
2251         .recv::<CommonChildStartupArgs>()
2252         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2253     let _child_cleanup = common_child_setup(startup_args).exit_context(
2254         Exit::CommonChildSetupError,
2255         "failed to perform common child setup",
2256     )?;
2257 
2258     cfg.broker_shutdown_event = Some(
2259         bootstrap_tube
2260             .recv::<Event>()
2261             .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
2262     );
2263     #[cfg(feature = "crash-report")]
2264     let crash_tube_map = bootstrap_tube
2265         .recv::<HashMap<ProcessType, Vec<SendTube>>>()
2266         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2267     #[cfg(feature = "crash-report")]
2268     crash_report::set_crash_tube_map(crash_tube_map);
2269 
2270     let BrokerTubes {
2271         vm_evt_wrtube,
2272         vm_evt_rdtube,
2273     } = bootstrap_tube
2274         .recv::<BrokerTubes>()
2275         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2276 
2277     run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2278 }
2279 
2280 pub fn run_config(cfg: Config) -> Result<ExitState> {
2281     let _raise_timer_resolution = enable_high_res_timers()
2282         .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
2283 
2284     // There is no broker when using run_config(), so the vm_evt tubes need to be created.
2285     let (vm_evt_wrtube, vm_evt_rdtube) =
2286         Tube::directional_pair().context("failed to create vm event tube")?;
2287 
2288     run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2289 }
2290 
2291 fn create_guest_memory(
2292     components: &VmComponents,
2293     arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2294     hypervisor: &impl Hypervisor,
2295 ) -> Result<GuestMemory> {
2296     let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
2297         .exit_context(
2298             Exit::GuestMemoryLayout,
2299             "failed to create guest memory layout",
2300         )?;
2301     GuestMemory::new_with_options(&guest_mem_layout)
2302         .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
2303 }
2304 
2305 fn run_config_inner(
2306     cfg: Config,
2307     vm_evt_wrtube: SendTube,
2308     vm_evt_rdtube: RecvTube,
2309 ) -> Result<ExitState> {
2310     product::setup_common_metric_invariants(&cfg);
2311 
2312     #[cfg(feature = "perfetto")]
2313     cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);
2314 
2315     let components: VmComponents = setup_vm_components(&cfg)?;
2316     let arch_memory_layout = Arch::arch_memory_layout(&components)?;
2317 
2318     #[allow(unused_mut)]
2319     let mut hypervisor = cfg
2320         .hypervisor
2321         .or_else(get_default_hypervisor)
2322         .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
2323 
2324     #[cfg(feature = "whpx")]
2325     if hypervisor::whpx::Whpx::is_enabled() {
2326         // If WHPX is enabled, no other hypervisor can be used, so just override it
2327         hypervisor = HypervisorKind::Whpx;
2328     }
2329 
2330     match hypervisor {
2331         #[cfg(feature = "haxm")]
2332         HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
2333             if hypervisor == HypervisorKind::Haxm {
2334                 set_use_ghaxm(false);
2335             }
2336             info!("Creating HAXM ghaxm={}", get_use_ghaxm());
2337             let haxm = Haxm::new()?;
2338             let guest_mem = create_guest_memory(&components, &arch_memory_layout, &haxm)?;
2339             let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
2340             let (ioapic_host_tube, ioapic_device_tube) =
2341                 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2342             let irq_chip =
2343                 create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
2344             run_vm::<HaxmVcpu, HaxmVm>(
2345                 cfg,
2346                 components,
2347                 &arch_memory_layout,
2348                 vm,
2349                 WindowsIrqChip::Userspace(irq_chip).as_mut(),
2350                 Some(ioapic_host_tube),
2351                 vm_evt_wrtube,
2352                 vm_evt_rdtube,
2353             )
2354         }
2355         #[cfg(feature = "whpx")]
2356         HypervisorKind::Whpx => {
2357             let apic_emulation_supported =
2358                 Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
2359                     .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;
2360 
2361             let no_smt = cfg.no_smt;
2362 
2363             // Default to WhpxSplitIrqChip if it's supported because it's more performant
2364             let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
2365                 IrqChipKind::Split
2366             } else {
2367                 IrqChipKind::Userspace
2368             });
2369 
2370             // Both WHPX irq chips use a userspace IOAPIC
2371             let (ioapic_host_tube, ioapic_device_tube) =
2372                 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2373 
2374             info!("Creating Whpx");
2375             let whpx = Whpx::new()?;
2376             let guest_mem = create_guest_memory(&components, &arch_memory_layout, &whpx)?;
2377             let vm = create_whpx_vm(
2378                 whpx,
2379                 guest_mem,
2380                 components.vcpu_count,
2381                 no_smt,
2382                 apic_emulation_supported && irq_chip == IrqChipKind::Split,
2383                 cfg.force_calibrated_tsc_leaf,
2384                 vm_evt_wrtube
2385                     .try_clone()
2386                     .expect("could not clone vm_evt_wrtube"),
2387             )?;
2388 
2389             let mut irq_chip = match irq_chip {
2390                 IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
2391                 IrqChipKind::Split => {
2392                     if !apic_emulation_supported {
2393                         panic!(
2394                             "split irqchip specified but your WHPX version does not support \
2395                                local apic emulation"
2396                         );
2397                     }
2398                     WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
2399                 }
2400                 IrqChipKind::Userspace => {
2401                     WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
2402                         components.vcpu_count,
2403                         ioapic_device_tube,
2404                     )?)
2405                 }
2406             };
2407             run_vm::<WhpxVcpu, WhpxVm>(
2408                 cfg,
2409                 components,
2410                 &arch_memory_layout,
2411                 vm,
2412                 irq_chip.as_mut(),
2413                 Some(ioapic_host_tube),
2414                 vm_evt_wrtube,
2415                 vm_evt_rdtube,
2416             )
2417         }
2418         #[cfg(feature = "gvm")]
2419         HypervisorKind::Gvm => {
2420             info!("Creating GVM");
2421             let gvm = Gvm::new()?;
2422             let guest_mem = create_guest_memory(&components, &arch_memory_layout, &gvm)?;
2423             let vm = create_gvm_vm(gvm, guest_mem)?;
2424             let ioapic_host_tube;
2425             let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
2426                 IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
2427                 IrqChipKind::Kernel => {
2428                     ioapic_host_tube = None;
2429                     WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
2430                 }
2431                 IrqChipKind::Userspace => {
2432                     let (host_tube, ioapic_device_tube) =
2433                         Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2434                     ioapic_host_tube = Some(host_tube);
2435                     WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
2436                         components.vcpu_count,
2437                         ioapic_device_tube,
2438                     )?)
2439                 }
2440             };
2441             run_vm::<GvmVcpu, GvmVm>(
2442                 cfg,
2443                 components,
2444                 &arch_memory_layout,
2445                 vm,
2446                 irq_chip.as_mut(),
2447                 ioapic_host_tube,
2448                 vm_evt_wrtube,
2449                 vm_evt_rdtube,
2450             )
2451         }
2452     }
2453 }
2454 
2455 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
2456 fn run_vm<Vcpu, V>(
2457     #[allow(unused_mut)] mut cfg: Config,
2458     #[allow(unused_mut)] mut components: VmComponents,
2459     arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2460     mut vm: V,
2461     irq_chip: &mut dyn IrqChipArch,
2462     ioapic_host_tube: Option<Tube>,
2463     vm_evt_wrtube: SendTube,
2464     vm_evt_rdtube: RecvTube,
2465 ) -> Result<ExitState>
2466 where
2467     Vcpu: VcpuArch + 'static,
2468     V: VmArch + 'static,
2469 {
2470     let vm_memory_size_mb = components.memory_size / (1024 * 1024);
2471     let mut control_tubes = Vec::new();
2472     let mut irq_control_tubes = Vec::new();
2473     let mut vm_memory_control_tubes = Vec::new();
2474     // Create one control tube per disk.
2475     let mut disk_device_tubes = Vec::new();
2476     let mut disk_host_tubes = Vec::new();
2477     let disk_count = cfg.disks.len();
2478     for _ in 0..disk_count {
2479         let (disk_host_tube, disk_device_tube) =
2480             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2481         disk_host_tubes.push(disk_host_tube);
2482         disk_device_tubes.push(disk_device_tube);
2483     }
2484 
2485     if let Some(ioapic_host_tube) = ioapic_host_tube {
2486         irq_control_tubes.push(ioapic_host_tube);
2487     }
2488 
2489     // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2490     let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
2491         let (balloon_host_tube, balloon_device_tube) =
2492             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2493         (Some(balloon_host_tube), Some(balloon_device_tube))
2494     } else {
2495         (None, None)
2496     };
2497     // The balloon device also needs a tube to communicate back to the main process to
2498     // handle remapping memory dynamically.
2499     let dynamic_mapping_device_tube = if cfg.balloon {
2500         let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
2501             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2502         vm_memory_control_tubes.push(dynamic_mapping_host_tube);
2503         Some(dynamic_mapping_device_tube)
2504     } else {
2505         None
2506     };
2507 
2508     // PvClock gets a tube for handling suspend/resume requests from the main thread.
2509     #[cfg(feature = "pvclock")]
2510     let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
2511         let (host, device) =
2512             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2513         (Some(host), Some(device))
2514     } else {
2515         (None, None)
2516     };
2517 
2518     let gralloc = RutabagaGralloc::new(RutabagaGrallocBackendFlags::new())
2519         .exit_context(Exit::CreateGralloc, "failed to create gralloc")?;
2520 
2521     let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2522     let mut sys_allocator = SystemAllocator::new(
2523         Arch::get_system_allocator_config(&vm, arch_memory_layout),
2524         pstore_size,
2525         &cfg.mmio_address_ranges,
2526     )
2527     .context("failed to create system allocator")?;
2528 
2529     // Allocate the ramoops region first.
2530     let ramoops_region = match &components.pstore {
2531         Some(pstore) => Some(
2532             arch::pstore::create_memory_region(
2533                 &mut vm,
2534                 sys_allocator.reserved_region().unwrap(),
2535                 pstore,
2536             )
2537             .exit_context(
2538                 Exit::Pstore,
2539                 format!("failed to allocate pstore region {:?}", &components.pstore),
2540             )?,
2541         ),
2542         None => None,
2543     };
2544 
2545     let init_balloon_size = components
2546         .memory_size
2547         .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
2548             m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
2549         }))
2550         .context("failed to calculate init balloon size")?;
2551 
2552     let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
2553     let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);
2554 
2555     if tsc_state.core_grouping.size() > 1 {
2556         // Host TSCs are not in sync, log a metric about it.
2557         warn!(
2558             "Host TSCs are not in sync, applying the following mitigations: {:?}",
2559             tsc_sync_mitigations
2560         );
2561         log_descriptor(
2562             MetricEventType::TscCoresOutOfSync,
2563             // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
2564             tsc_state.core_grouping.core_grouping_bitmask() as i64,
2565         );
2566     }
2567 
2568     #[cfg(feature = "gpu")]
2569     let gpu_control_tube = cfg
2570         .gpu_vmm_config
2571         .as_mut()
2572         .and_then(|config| config.gpu_control_host_tube.take());
2573     let product_args = product::get_run_control_args(&mut cfg);
2574 
2575     // We open these files before lowering the token, as in the future a stricter policy may
2576     // prevent it.
2577     let dt_overlays = cfg
2578         .device_tree_overlay
2579         .iter()
2580         .map(|o| {
2581             Ok(DtbOverlay {
2582                 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2583                     .with_context(|| {
2584                         format!("failed to open device tree overlay {}", o.path.display())
2585                     })?,
2586             })
2587         })
2588         .collect::<Result<Vec<DtbOverlay>>>()?;
2589 
2590     // Lower the token, locking the main process down to a stricter security policy.
2591     //
2592     // WARNING:
2593     //
2594     // Windows system calls can behave in unusual ways if they happen concurrently to the token
2595     // lowering. For example, access denied can happen if Tube pairs are created in another thread
2596     // (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
2597     // not privileged resources, but can be broken due to the token changing unexpectedly.
2598     //
2599     // We explicitly lower the token here and *then* call run_control to make it clear that any
2600     // resources that require a privileged token should be created on the main thread & passed into
2601     // run_control, to follow the correct order:
2602     // - Privileged resources are created.
2603     // - Token is lowered.
2604     // - Threads are spawned & may create more non-privileged resources (without fear of the token
2605     //   changing at an undefined time).
2606     //
2607     // Recommendation: If you find your code doesn't work in run_control because of the sandbox, you
2608     // should split any resource creation to before this token lowering & pass the resources into
2609     // run_control. Don't move the token lowering somewhere else without considering multi-threaded
2610     // effects.
2611     #[cfg(feature = "sandbox")]
2612     if sandbox::is_sandbox_target() {
2613         sandbox::TargetServices::get()
2614             .exit_code_from_err("failed to create sandbox")?
2615             .expect("Could not create sandbox!")
2616             .lower_token();
2617     }
2618 
2619     let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;
2620 
2621     let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;
2622 
2623     let mut initial_audio_session_states: Vec<InitialAudioSessionState> = Vec::new();
2624 
2625     let pci_devices = create_devices(
2626         &mut cfg,
2627         vm.get_memory(),
2628         &vm_evt_wrtube,
2629         &mut irq_control_tubes,
2630         &mut vm_memory_control_tubes,
2631         &mut control_tubes,
2632         &mut disk_device_tubes,
2633         &mut initial_audio_session_states,
2634         balloon_device_tube,
2635         #[cfg(feature = "pvclock")]
2636         pvclock_device_tube,
2637         dynamic_mapping_device_tube,
2638         /* inflate_tube= */ None,
2639         init_balloon_size,
2640         tsc_state.frequency,
2641         virtio_snd_state_device_tube,
2642         virtio_snd_device_mute_tube,
2643     )?;
2644 
2645     let mut vcpu_ids = Vec::new();
2646 
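         // Tube pair for the virtual watchdog (vmwdt) device.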
2647     let (vmwdt_host_tube, vmwdt_device_tube) = Tube::pair().context("failed to create tube")?;
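         // Assemble the guest VM: the architecture-specific builder lays out guest memory,
         // attaches the devices built above, and returns the runnable VM.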
2648     let windows = Arch::build_vm::<V, Vcpu>(
2649         components,
2650         arch_memory_layout,
2651         &vm_evt_wrtube,
2652         &mut sys_allocator,
2653         &cfg.serial_parameters,
2654         None,
2655         (cfg.battery_config.as_ref().map(|t| t.type_), None),
2656         vm,
2657         ramoops_region,
2658         pci_devices,
2659         irq_chip,
2660         &mut vcpu_ids,
2661         cfg.dump_device_tree_blob.clone(),
2662         /* debugcon_jail= */ None,
2663         None,
2664         None,
2665         /* guest_suspended_cvar= */ None,
2666         dt_overlays,
2667         cfg.fdt_position,
2668         cfg.no_pmu,
2669     )
2670     .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;
2671 
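         // Optionally collect exit statistics; the collector is shared with run_control below.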
2672     #[cfg(feature = "stats")]
2673     let stats = if cfg.exit_stats {
2674         Some(Arc::new(Mutex::new(StatisticsCollector::new())))
2675     } else {
2676         None
2677     };
2678 
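         // Enter the main control loop; it blocks until the VM shuts down, and its result is
         // returned as the result of this function.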
2679     run_control(
2680         windows,
2681         sys_allocator,
2682         control_tubes,
2683         irq_control_tubes,
2684         vm_memory_control_tubes,
2685         vm_evt_rdtube,
2686         vm_evt_wrtube,
2687         #[cfg(feature = "gpu")]
2688         gpu_control_tube,
2689         cfg.broker_shutdown_event.take(),
2690         balloon_host_tube,
2691         #[cfg(feature = "pvclock")]
2692         pvclock_host_tube,
2693         disk_host_tubes,
2694         initial_audio_session_states,
2695         gralloc,
2696         #[cfg(feature = "stats")]
2697         stats,
2698         cfg.service_pipe_name,
2699         vm_memory_size_mb,
2700         cfg.host_cpu_topology,
2701         tsc_sync_mitigations,
2702         cfg.force_calibrated_tsc_leaf,
2703         product_args,
2704         match virtio_snd_host_mute_tube {
2705             Some(virtio_snd_host_mute_tube) => vec![virtio_snd_host_mute_tube],
2706             None => vec![],
2707         },
2708         cfg.restore_path,
2709         cfg.socket_path,
2710         cfg.force_s2idle,
2711         cfg.suspended,
2712     )
2713 }
2714 
2715 #[cfg(test)]
2716 mod tests {
2717     use tempfile::TempDir;
2718 
2719     use super::*;
2720 
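         // Builds a minimal Config whose kernel path points at a placeholder file in `test_dir`.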
2721     fn create_config(test_dir: &TempDir) -> Config {
2722         let mut config = Config::default();
2723 
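             // Create an empty placeholder file to stand in for a kernel image.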
2724         let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
2725         OpenOptions::new()
2726             .create_new(true)
2727             .write(true)
2728             .open(&dummy_kernel_path)
2729             .expect("Could not open file!");
2730         config.executable_path = Some(Executable::Kernel(dummy_kernel_path));
2731 
2732         config
2733     }
2734 
2735     #[test]
2736     #[should_panic(expected = "Did not receive a bios or kernel")]
2737     fn setup_vm_components_panics_when_no_kernel_provided() {
2738         let mut config =
2739             create_config(&TempDir::new().expect("Could not create temporary directory!"));
2740         config.executable_path = None;
2741         let _ = setup_vm_components(&config);
2742     }
2743 
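         // `config.memory` is specified in megabytes; VmComponents stores the size in bytes.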
2744     #[test]
2745     fn setup_vm_components_stores_memory_in_bytes() {
2746         let tempdir = TempDir::new().expect("Could not create temporary directory!");
2747         let mut config = create_config(&tempdir);
2748         config.memory = Some(1);
2749         let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
2750         assert_eq!(vm_components.memory_size, 1024 * 1024);
2751     }
2752 
2753     #[test]
2754     fn setup_vm_components_fails_when_memory_too_large() {
2755         let tempdir = TempDir::new().expect("Could not create temporary directory!");
2756         let mut config = create_config(&tempdir);
2757         // One MB more than the largest memory size (in MB) whose byte count fits in a u64.
2758         config.memory = Some((u64::MAX / 1024 / 1024) + 1);
2759         setup_vm_components(&config).err().expect("expected error");
2760     }
2761 }
2762