1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // TODO(b:240716507): There is a huge chunk of code which depends on haxm, whpx, or gvm being enabled
6 // but isn't marked as such. Remove this allow once it is.
7 #![allow(dead_code, unused_imports, unused_variables, unreachable_code)]
8 
9 pub(crate) mod control_server;
10 pub(crate) mod irq_wait;
11 pub(crate) mod main;
12 #[cfg(not(feature = "crash-report"))]
13 mod panic_hook;
14 
15 mod generic;
16 use generic as product;
17 pub(crate) mod run_vcpu;
18 
19 #[cfg(feature = "whpx")]
20 use std::arch::x86_64::__cpuid;
21 #[cfg(feature = "whpx")]
22 use std::arch::x86_64::__cpuid_count;
23 use std::cmp::Reverse;
24 use std::collections::BTreeMap;
25 use std::collections::HashMap;
26 use std::fs::File;
27 use std::fs::OpenOptions;
28 use std::io::stdin;
29 use std::iter;
30 use std::mem;
31 use std::os::windows::fs::OpenOptionsExt;
32 use std::path::PathBuf;
33 use std::sync::mpsc;
34 use std::sync::Arc;
35 
36 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
37 use aarch64::AArch64 as Arch;
38 use acpi_tables::sdt::SDT;
39 use anyhow::anyhow;
40 use anyhow::bail;
41 use anyhow::Context;
42 use anyhow::Result;
43 use arch::CpuConfigArch;
44 use arch::DtbOverlay;
45 use arch::IrqChipArch;
46 use arch::LinuxArch;
47 use arch::RunnableLinuxVm;
48 use arch::VcpuArch;
49 use arch::VirtioDeviceStub;
50 use arch::VmArch;
51 use arch::VmComponents;
52 use arch::VmImage;
53 use base::enable_high_res_timers;
54 use base::error;
55 use base::info;
56 use base::open_file_or_duplicate;
57 use base::warn;
58 use base::AsRawDescriptor;
59 #[cfg(feature = "gpu")]
60 use base::BlockingMode;
61 use base::CloseNotifier;
62 use base::Event;
63 use base::EventToken;
64 use base::EventType;
65 use base::FlushOnDropTube;
66 #[cfg(feature = "gpu")]
67 use base::FramingMode;
68 use base::FromRawDescriptor;
69 use base::ProtoTube;
70 use base::RawDescriptor;
71 use base::ReadNotifier;
72 use base::RecvTube;
73 use base::SendTube;
74 #[cfg(feature = "gpu")]
75 use base::StreamChannel;
76 use base::Terminal;
77 use base::TriggeredEvent;
78 use base::Tube;
79 use base::TubeError;
80 use base::VmEventType;
81 use base::WaitContext;
82 use broker_ipc::common_child_setup;
83 use broker_ipc::CommonChildStartupArgs;
84 use control_server::ControlServer;
85 use crosvm_cli::sys::windows::exit::Exit;
86 use crosvm_cli::sys::windows::exit::ExitContext;
87 use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
88 use crosvm_cli::sys::windows::exit::ExitContextOption;
89 use devices::create_devices_worker_thread;
90 use devices::serial_device::SerialHardware;
91 use devices::serial_device::SerialParameters;
92 use devices::tsc::get_tsc_sync_mitigations;
93 use devices::tsc::standard_deviation;
94 use devices::tsc::TscSyncMitigations;
95 use devices::virtio;
96 use devices::virtio::block::DiskOption;
97 #[cfg(feature = "audio")]
98 use devices::virtio::snd::common_backend::VirtioSnd;
99 #[cfg(feature = "audio")]
100 use devices::virtio::snd::parameters::Parameters as SndParameters;
101 #[cfg(feature = "gpu")]
102 use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
103 #[cfg(feature = "gpu")]
104 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
105 #[cfg(feature = "gpu")]
106 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
107 #[cfg(feature = "gpu")]
108 use devices::virtio::vhost::user::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
109 #[cfg(feature = "gpu")]
110 use devices::virtio::vhost::user::gpu::sys::windows::run_gpu_device_worker;
111 #[cfg(feature = "audio")]
112 use devices::virtio::vhost::user::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
113 #[cfg(feature = "balloon")]
114 use devices::virtio::BalloonFeatures;
115 #[cfg(feature = "balloon")]
116 use devices::virtio::BalloonMode;
117 use devices::virtio::Console;
118 #[cfg(feature = "gpu")]
119 use devices::virtio::GpuParameters;
120 use devices::BusDeviceObj;
121 #[cfg(feature = "gvm")]
122 use devices::GvmIrqChip;
123 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
124 use devices::IrqChip;
125 use devices::UserspaceIrqChip;
126 use devices::VcpuRunState;
127 use devices::VirtioPciDevice;
128 #[cfg(feature = "whpx")]
129 use devices::WhpxSplitIrqChip;
130 #[cfg(feature = "gpu")]
131 use gpu_display::EventDevice;
132 #[cfg(feature = "gpu")]
133 use gpu_display::WindowProcedureThread;
134 #[cfg(feature = "gpu")]
135 use gpu_display::WindowProcedureThreadBuilder;
136 #[cfg(feature = "gvm")]
137 use hypervisor::gvm::Gvm;
138 #[cfg(feature = "gvm")]
139 use hypervisor::gvm::GvmVcpu;
140 #[cfg(feature = "gvm")]
141 use hypervisor::gvm::GvmVersion;
142 #[cfg(feature = "gvm")]
143 use hypervisor::gvm::GvmVm;
144 #[cfg(feature = "haxm")]
145 use hypervisor::haxm::get_use_ghaxm;
146 #[cfg(feature = "haxm")]
147 use hypervisor::haxm::set_use_ghaxm;
148 #[cfg(feature = "haxm")]
149 use hypervisor::haxm::Haxm;
150 #[cfg(feature = "haxm")]
151 use hypervisor::haxm::HaxmVcpu;
152 #[cfg(feature = "haxm")]
153 use hypervisor::haxm::HaxmVm;
154 #[cfg(feature = "whpx")]
155 use hypervisor::whpx::Whpx;
156 #[cfg(feature = "whpx")]
157 use hypervisor::whpx::WhpxFeature;
158 #[cfg(feature = "whpx")]
159 use hypervisor::whpx::WhpxVcpu;
160 #[cfg(feature = "whpx")]
161 use hypervisor::whpx::WhpxVm;
162 use hypervisor::Hypervisor;
163 #[cfg(feature = "whpx")]
164 use hypervisor::HypervisorCap;
165 #[cfg(feature = "whpx")]
166 use hypervisor::HypervisorX86_64;
167 use hypervisor::ProtectionType;
168 use hypervisor::Vm;
169 use irq_wait::IrqWaitWorker;
170 use jail::FakeMinijailStub as Minijail;
171 #[cfg(not(feature = "crash-report"))]
172 pub(crate) use panic_hook::set_panic_hook;
173 use product::create_snd_mute_tube_pair;
174 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
175 use product::create_snd_state_tube;
176 use product::handle_pvclock_request;
177 use product::merge_session_invariants;
178 use product::run_ime_thread;
179 use product::set_package_name;
180 pub(crate) use product::setup_metrics_reporting;
181 use product::start_service_ipc_listener;
182 use product::RunControlArgs;
183 use product::ServiceVmState;
184 use product::Token;
185 use resources::SystemAllocator;
186 use run_vcpu::run_all_vcpus;
187 use run_vcpu::VcpuRunMode;
188 use rutabaga_gfx::RutabagaGralloc;
189 use rutabaga_gfx::RutabagaGrallocBackendFlags;
190 use smallvec::SmallVec;
191 use sync::Mutex;
192 use tube_transporter::TubeToken;
193 use tube_transporter::TubeTransporterReader;
194 use vm_control::api::VmMemoryClient;
195 #[cfg(feature = "balloon")]
196 use vm_control::BalloonControlCommand;
197 #[cfg(feature = "balloon")]
198 use vm_control::BalloonTube;
199 use vm_control::DeviceControlCommand;
200 use vm_control::IrqHandlerRequest;
201 use vm_control::PvClockCommand;
202 use vm_control::VcpuControl;
203 use vm_control::VmMemoryRegionState;
204 use vm_control::VmMemoryRequest;
205 use vm_control::VmRequest;
206 use vm_control::VmResponse;
207 use vm_control::VmRunMode;
208 use vm_memory::GuestAddress;
209 use vm_memory::GuestMemory;
210 use win_util::ProcessType;
211 #[cfg(feature = "whpx")]
212 use x86_64::cpuid::adjust_cpuid;
213 #[cfg(feature = "whpx")]
214 use x86_64::cpuid::CpuIdContext;
215 #[cfg(all(target_arch = "x86_64", feature = "haxm"))]
216 use x86_64::get_cpu_manufacturer;
217 #[cfg(all(target_arch = "x86_64", feature = "haxm"))]
218 use x86_64::CpuManufacturer;
219 #[cfg(target_arch = "x86_64")]
220 use x86_64::X8664arch as Arch;
221 
222 use crate::crosvm::config::Config;
223 use crate::crosvm::config::Executable;
224 use crate::crosvm::config::InputDeviceOption;
225 #[cfg(any(feature = "gvm", feature = "whpx"))]
226 use crate::crosvm::config::IrqChipKind;
227 #[cfg(feature = "gpu")]
228 use crate::crosvm::config::TouchDeviceOption;
229 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
230 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
231 use crate::crosvm::sys::config::HypervisorKind;
232 use crate::crosvm::sys::windows::broker::BrokerTubes;
233 #[cfg(feature = "stats")]
234 use crate::crosvm::sys::windows::stats::StatisticsCollector;
235 #[cfg(feature = "gpu")]
236 pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
237 #[cfg(feature = "audio")]
238 pub(crate) use crate::sys::windows::product::get_snd_product_configs;
239 #[cfg(feature = "gpu")]
240 pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
241 use crate::sys::windows::product::log_descriptor;
242 #[cfg(feature = "audio")]
243 pub(crate) use crate::sys::windows::product::num_input_sound_devices;
244 #[cfg(feature = "audio")]
245 pub(crate) use crate::sys::windows::product::num_input_sound_streams;
246 use crate::sys::windows::product::spawn_anti_tamper_thread;
247 use crate::sys::windows::product::MetricEventType;
248 
249 const DEFAULT_GUEST_CID: u64 = 3;
250 
251 // By default, if enabled, the balloon working set (WS) feature will use 4 bins.
252 const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;
253 
254 enum TaggedControlTube {
255     Vm(FlushOnDropTube),
256     Product(product::TaggedControlTube),
257 }
258 
259 impl ReadNotifier for TaggedControlTube {
260     fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
261         match self {
262             Self::Vm(tube) => tube.0.get_read_notifier(),
263             Self::Product(tube) => tube.get_read_notifier(),
264         }
265     }
266 }
267 
268 impl CloseNotifier for TaggedControlTube {
269     fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
270         match self {
271             Self::Vm(tube) => tube.0.get_close_notifier(),
272             Self::Product(tube) => tube.get_close_notifier(),
273         }
274     }
275 }
276 
277 pub enum ExitState {
278     Reset,
279     Stop,
280     Crash,
281     #[allow(dead_code)]
282     GuestPanic,
283     WatchdogReset,
284 }
285 
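/// Result type shared by the device creation helpers below; the payload defaults to a VirtioDeviceStub.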
286 type DeviceResult<T = VirtioDeviceStub> = Result<T>;
287 
288 fn create_vhost_user_block_device(cfg: &Config, disk_device_tube: Tube) -> DeviceResult {
289     let dev = virtio::VhostUserFrontend::new(
290         virtio::DeviceType::Block,
291         virtio::base_features(cfg.protection_type),
292         disk_device_tube,
293         None,
294         None,
295     )
296     .exit_context(
297         Exit::VhostUserBlockDeviceNew,
298         "failed to set up vhost-user block device",
299     )?;
300 
301     Ok(VirtioDeviceStub {
302         dev: Box::new(dev),
303         jail: None,
304     })
305 }
306 
307 fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
308     let features = virtio::base_features(cfg.protection_type);
309     let dev = virtio::BlockAsync::new(
310         features,
311         disk.open()?,
312         disk,
313         Some(disk_device_tube),
314         None,
315         None,
316     )
317     .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;
318 
319     Ok(VirtioDeviceStub {
320         dev: Box::new(dev),
321         jail: None,
322     })
323 }
324 
325 #[cfg(feature = "gpu")]
326 fn create_vhost_user_gpu_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult {
327     let dev = virtio::VhostUserFrontend::new(
328         virtio::DeviceType::Gpu,
329         base_features,
330         vhost_user_tube,
331         None,
332         None,
333     )
334     .exit_context(
335         Exit::VhostUserGpuDeviceNew,
336         "failed to set up vhost-user gpu device",
337     )?;
338 
339     Ok(VirtioDeviceStub {
340         dev: Box::new(dev),
341         jail: None,
342     })
343 }
344 
345 #[cfg(feature = "audio")]
346 fn create_snd_device(
347     cfg: &Config,
348     parameters: SndParameters,
349     _product_args: SndBackendConfigProduct,
350 ) -> DeviceResult {
351     let features = virtio::base_features(cfg.protection_type);
352     let dev = VirtioSnd::new(features, parameters)
353         .exit_context(Exit::VirtioSoundDeviceNew, "failed to create snd device")?;
354 
355     Ok(VirtioDeviceStub {
356         dev: Box::new(dev),
357         jail: None,
358     })
359 }
360 
361 #[cfg(feature = "audio")]
362 fn create_vhost_user_snd_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult {
363     let dev = virtio::VhostUserFrontend::new(
364         virtio::DeviceType::Sound,
365         base_features,
366         vhost_user_tube,
367         None,
368         None,
369     )
370     .exit_context(
371         Exit::VhostUserSndDeviceNew,
372         "failed to set up vhost-user snd device",
373     )?;
374 
375     Ok(VirtioDeviceStub {
376         dev: Box::new(dev),
377         jail: None,
378     })
379 }
380 
381 #[cfg(feature = "gpu")]
382 fn create_multi_touch_device(
383     cfg: &Config,
384     event_pipe: StreamChannel,
385     width: u32,
386     height: u32,
387     name: Option<&str>,
388     idx: u32,
389 ) -> DeviceResult {
390     let dev = virtio::input::new_multi_touch(
391         idx,
392         event_pipe,
393         width,
394         height,
395         name,
396         virtio::base_features(cfg.protection_type),
397     )
398     .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
399     Ok(VirtioDeviceStub {
400         dev: Box::new(dev),
401         jail: None,
402     })
403 }
404 
405 #[cfg(feature = "gpu")]
406 fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
407     let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
408         .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
409     Ok(VirtioDeviceStub {
410         dev: Box::new(dev),
411         jail: None,
412     })
413 }
414 
415 #[cfg(feature = "slirp")]
416 fn create_vhost_user_net_device(cfg: &Config, net_device_tube: Tube) -> DeviceResult {
417     let features = virtio::base_features(cfg.protection_type);
418     let dev = virtio::VhostUserFrontend::new(
419         virtio::DeviceType::Net,
420         features,
421         net_device_tube,
422         None,
423         None,
424     )
425     .exit_context(
426         Exit::VhostUserNetDeviceNew,
427         "failed to set up vhost-user net device",
428     )?;
429 
430     Ok(VirtioDeviceStub {
431         dev: Box::new(dev),
432         jail: None,
433     })
434 }
435 
436 fn create_rng_device(cfg: &Config) -> DeviceResult {
437     let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
438         .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;
439 
440     Ok(VirtioDeviceStub {
441         dev: Box::new(dev),
442         jail: None,
443     })
444 }
445 
446 fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
447     let mut keep_rds = Vec::new();
448     let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
449     let dev = param
450         .create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
451         .exit_context(Exit::CreateConsole, "failed to create console device")?;
452 
453     Ok(VirtioDeviceStub {
454         dev: Box::new(dev),
455         jail: None,
456     })
457 }
458 
459 #[cfg(feature = "balloon")]
460 fn create_balloon_device(
461     cfg: &Config,
462     balloon_device_tube: Tube,
463     dynamic_mapping_device_tube: Tube,
464     inflate_tube: Option<Tube>,
465     init_balloon_size: u64,
466 ) -> DeviceResult {
467     let balloon_features =
468         (cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
469     let dev = virtio::Balloon::new(
470         virtio::base_features(cfg.protection_type),
471         balloon_device_tube,
472         VmMemoryClient::new(dynamic_mapping_device_tube),
473         inflate_tube,
474         init_balloon_size,
475         if cfg.strict_balloon {
476             BalloonMode::Strict
477         } else {
478             BalloonMode::Relaxed
479         },
480         balloon_features,
481         #[cfg(feature = "registered_events")]
482         None,
483         VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
484     )
485     .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;
486 
487     Ok(VirtioDeviceStub {
488         dev: Box::new(dev),
489         jail: None,
490     })
491 }
492 
493 fn create_vsock_device(cfg: &Config) -> DeviceResult {
494     // We only support a single guest, so we can confidently assign a default
495     // CID if one isn't provided. We choose the lowest non-reserved value.
496     let dev = virtio::vsock::Vsock::new(
497         cfg.vsock
498             .as_ref()
499             .map(|cfg| cfg.cid)
500             .unwrap_or(DEFAULT_GUEST_CID),
501         cfg.host_guid.clone(),
502         virtio::base_features(cfg.protection_type),
503     )
504     .exit_context(
505         Exit::UserspaceVsockDeviceNew,
506         "failed to create userspace vsock device",
507     )?;
508 
509     Ok(VirtioDeviceStub {
510         dev: Box::new(dev),
511         jail: None,
512     })
513 }
514 
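// Builds the set of virtio device stubs (block, console, sound, pvclock, rng, net, balloon, vsock,
// input, and GPU) requested by the configuration.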
515 fn create_virtio_devices(
516     cfg: &mut Config,
517     vm_evt_wrtube: &SendTube,
518     #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
519     disk_device_tubes: &mut Vec<Tube>,
520     balloon_device_tube: Option<Tube>,
521     pvclock_device_tube: Option<Tube>,
522     dynamic_mapping_device_tube: Option<Tube>,
523     inflate_tube: Option<Tube>,
524     init_balloon_size: u64,
525     tsc_frequency: u64,
526     virtio_snd_state_device_tube: Option<Tube>,
527     virtio_snd_control_device_tube: Option<Tube>,
528 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
529     let mut devs = Vec::new();
530 
531     if cfg.block_vhost_user_tube.is_empty() {
532         // Disk devices must precede virtio-console devices or the kernel does not boot.
533         // TODO(b/171215421): figure out why this ordering is required and fix it.
534         for disk in &cfg.disks {
535             let disk_device_tube = disk_device_tubes.remove(0);
536             devs.push(create_block_device(cfg, disk, disk_device_tube)?);
537         }
538     } else {
539         info!("Starting up vhost user block backends...");
540         for _disk in &cfg.disks {
541             let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
542             devs.push(create_vhost_user_block_device(cfg, disk_device_tube)?);
543         }
544     }
545 
546     for (_, param) in cfg
547         .serial_parameters
548         .iter()
549         .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
550     {
551         let dev = create_console_device(cfg, param)?;
552         devs.push(dev);
553     }
554 
555     #[cfg(feature = "audio")]
556     if product::virtio_sound_enabled() {
557         let snd_split_config = cfg
558             .snd_split_config
559             .as_mut()
560             .expect("snd_split_config must exist");
561         let snd_vmm_config = snd_split_config
562             .vmm_config
563             .as_mut()
564             .expect("snd_vmm_config must exist");
565         product::push_snd_control_tubes(control_tubes, snd_vmm_config);
566 
567         match snd_split_config.backend_config.take() {
568             None => {
569                 // No backend config present means the backend is running in another process.
570                 devs.push(create_vhost_user_snd_device(
571                     virtio::base_features(cfg.protection_type),
572                     snd_vmm_config
573                         .main_vhost_user_tube
574                         .take()
575                         .expect("Snd VMM vhost-user tube should be set"),
576                 )?);
577             }
578             Some(backend_config) => {
579                 // Backend config present, so initialize Snd in this process.
580                 devs.push(create_snd_device(
581                     cfg,
582                     backend_config.parameters,
583                     backend_config.product_config,
584                 )?);
585             }
586         }
587     }
588 
589     if let Some(tube) = pvclock_device_tube {
590         product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
591     }
592 
593     devs.push(create_rng_device(cfg)?);
594 
595     #[cfg(feature = "slirp")]
596     if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
597         devs.push(create_vhost_user_net_device(cfg, net_vhost_user_tube)?);
598     }
599 
600     #[cfg(feature = "balloon")]
601     if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
602         (balloon_device_tube, dynamic_mapping_device_tube)
603     {
604         devs.push(create_balloon_device(
605             cfg,
606             balloon_device_tube,
607             dynamic_mapping_device_tube,
608             inflate_tube,
609             init_balloon_size,
610         )?);
611     }
612 
613     devs.push(create_vsock_device(cfg)?);
614 
615     #[cfg(feature = "gpu")]
616     let event_devices = if let Some(InputEventSplitConfig {
617         backend_config,
618         vmm_config,
619     }) = cfg.input_event_split_config.take()
620     {
621         devs.extend(
622             create_virtio_input_event_devices(cfg, vmm_config)
623                 .context("create input event devices")?,
624         );
625         backend_config.map(|cfg| cfg.event_devices)
626     } else {
627         None
628     };
629 
630     #[cfg(feature = "gpu")]
631     if let Some(wndproc_thread_vmm_config) = cfg
632         .window_procedure_thread_split_config
633         .as_mut()
634         .map(|split_cfg| &mut split_cfg.vmm_config)
635     {
636         product::push_window_procedure_thread_control_tubes(
637             control_tubes,
638             wndproc_thread_vmm_config,
639         );
640     }
641 
642     #[cfg(feature = "gpu")]
643     let mut wndproc_thread = cfg
644         .window_procedure_thread_split_config
645         .as_mut()
646         .and_then(|cfg| cfg.wndproc_thread_builder.take())
647         .map(WindowProcedureThreadBuilder::start_thread)
648         .transpose()
649         .context("Failed to start the window procedure thread.")?;
650 
651     #[cfg(feature = "gpu")]
652     if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
653         devs.push(create_virtio_gpu_device(
654             cfg,
655             gpu_vmm_config,
656             event_devices,
657             &mut wndproc_thread,
658             control_tubes,
659         )?);
660     }
661 
662     Ok(devs)
663 }
664 
665 #[cfg(feature = "gpu")]
666 fn create_virtio_input_event_devices(
667     cfg: &Config,
668     mut input_event_vmm_config: InputEventVmmConfig,
669 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
670     let mut devs = Vec::new();
671 
672     // Iterate over the event devices, creating the VMM end of each.
673     let mut multi_touch_pipes = input_event_vmm_config
674         .multi_touch_pipes
675         .drain(..)
676         .enumerate();
677     for input in &cfg.virtio_input {
678         match input {
679             InputDeviceOption::SingleTouch { .. } => {
680                 unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
681             }
682             InputDeviceOption::MultiTouch {
683                 width,
684                 height,
685                 name,
686                 ..
687             } => {
688                 let Some((idx, pipe)) = multi_touch_pipes.next() else {
689                     break;
690                 };
691                 let mut width = *width;
692                 let mut height = *height;
693                 if idx == 0 {
694                     if width.is_none() {
695                         width = cfg.display_input_width;
696                     }
697                     if height.is_none() {
698                         height = cfg.display_input_height;
699                     }
700                 }
701                 devs.push(create_multi_touch_device(
702                     cfg,
703                     pipe,
704                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
705                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
706                     name.as_deref(),
707                     idx as u32,
708                 )?);
709             }
710             _ => {}
711         }
712     }
713     drop(multi_touch_pipes);
714 
715     product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;
716 
717     for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
718         devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
719     }
720 
721     let keyboard_pipe = input_event_vmm_config
722         .keyboard_pipes
723         .pop()
724         .expect("at least one keyboard should be in GPU VMM config");
725     let dev = virtio::input::new_keyboard(
726         /* idx= */ 0,
727         keyboard_pipe,
728         virtio::base_features(cfg.protection_type),
729     )
730     .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
731 
732     devs.push(VirtioDeviceStub {
733         dev: Box::new(dev),
734         jail: None,
735     });
736 
737     Ok(devs)
738 }
739 
740 #[cfg(feature = "gpu")]
741 fn create_virtio_gpu_device(
742     cfg: &mut Config,
743     mut gpu_vmm_config: GpuVmmConfig,
744     event_devices: Option<Vec<EventDevice>>,
745     wndproc_thread: &mut Option<WindowProcedureThread>,
746     #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
747 ) -> DeviceResult<VirtioDeviceStub> {
748     let resource_bridges = Vec::<Tube>::new();
749 
750     product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);
751 
752     // If the GPU backend is passed, start up the vhost-user worker in the main process.
753     if let Some(backend_config) = cfg.gpu_backend_config.take() {
754         let event_devices = event_devices.ok_or_else(|| {
755             anyhow!("event devices are missing when creating virtio-gpu in the current process.")
756         })?;
757         let wndproc_thread = wndproc_thread
758             .take()
759             .ok_or_else(|| anyhow!("Window procedure thread is missing."))?;
760 
761         std::thread::spawn(move || {
762             run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
763         });
764     }
765 
766     // The GPU is always vhost-user, even if running in the main process.
767     create_vhost_user_gpu_device(
768         virtio::base_features(cfg.protection_type),
769         gpu_vmm_config
770             .main_vhost_user_tube
771             .take()
772             .expect("GPU VMM vhost-user tube should be set"),
773     )
774     .context("create vhost-user GPU device")
775 }
776 
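// Creates all virtio devices and wraps each stub in a VirtioPciDevice, wiring up the MSI,
// shared-memory, ioevent, and VM control tubes for each one.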
777 fn create_devices(
778     cfg: &mut Config,
779     mem: &GuestMemory,
780     exit_evt_wrtube: &SendTube,
781     irq_control_tubes: &mut Vec<Tube>,
782     vm_memory_control_tubes: &mut Vec<Tube>,
783     control_tubes: &mut Vec<TaggedControlTube>,
784     disk_device_tubes: &mut Vec<Tube>,
785     balloon_device_tube: Option<Tube>,
786     pvclock_device_tube: Option<Tube>,
787     dynamic_mapping_device_tube: Option<Tube>,
788     inflate_tube: Option<Tube>,
789     init_balloon_size: u64,
790     tsc_frequency: u64,
791     virtio_snd_state_device_tube: Option<Tube>,
792     virtio_snd_control_device_tube: Option<Tube>,
793 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
794     let stubs = create_virtio_devices(
795         cfg,
796         exit_evt_wrtube,
797         control_tubes,
798         disk_device_tubes,
799         balloon_device_tube,
800         pvclock_device_tube,
801         dynamic_mapping_device_tube,
802         inflate_tube,
803         init_balloon_size,
804         tsc_frequency,
805         virtio_snd_state_device_tube,
806         virtio_snd_control_device_tube,
807     )?;
808 
809     let mut pci_devices = Vec::new();
810 
811     for stub in stubs {
812         let (msi_host_tube, msi_device_tube) =
813             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
814         irq_control_tubes.push(msi_host_tube);
815 
816         let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
817             let (host_tube, device_tube) =
818                 Tube::pair().context("failed to create VVU proxy tube")?;
819             vm_memory_control_tubes.push(host_tube);
820             Some(device_tube)
821         } else {
822             None
823         };
824 
825         let (ioevent_host_tube, ioevent_device_tube) =
826             Tube::pair().context("failed to create ioevent tube")?;
827         vm_memory_control_tubes.push(ioevent_host_tube);
828 
829         let (vm_control_host_tube, vm_control_device_tube) =
830             Tube::pair().context("failed to create vm_control tube")?;
831         control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
832             vm_control_host_tube,
833         )));
834 
835         let dev = Box::new(
836             VirtioPciDevice::new(
837                 mem.clone(),
838                 stub.dev,
839                 msi_device_tube,
840                 cfg.disable_virtio_intx,
841                 shared_memory_tube.map(VmMemoryClient::new),
842                 VmMemoryClient::new(ioevent_device_tube),
843                 vm_control_device_tube,
844             )
845             .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
846         ) as Box<dyn BusDeviceObj>;
847         pci_devices.push((dev, stub.jail));
848     }
849 
850     Ok(pci_devices)
851 }
852 
853 #[derive(Debug)]
854 struct PvClockError(String);
855 
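// Handles a single readable event from the main run loop's WaitContext. Returns Some(ExitState)
// when the event requires the VM to stop, reset, or otherwise exit.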
856 fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
857     event: &TriggeredEvent<Token>,
858     vm_control_ids_to_remove: &mut Vec<usize>,
859     next_control_id: &mut usize,
860     service_vm_state: &mut ServiceVmState,
861     disk_host_tubes: &[Tube],
862     ipc_main_loop_tube: Option<&Tube>,
863     #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
864     vm_evt_rdtube: &RecvTube,
865     control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
866     guest_os: &mut RunnableLinuxVm<V, Vcpu>,
867     sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
868     virtio_snd_host_mute_tube: &mut Option<Tube>,
869     proto_main_loop_tube: Option<&ProtoTube>,
870     anti_tamper_main_thread_tube: &Option<ProtoTube>,
871     #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
872     memory_size_mb: u64,
873     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
874     pvclock_host_tube: &Option<Tube>,
875     run_mode_arc: &VcpuRunMode,
876     region_state: &mut VmMemoryRegionState,
877     vm_control_server: Option<&mut ControlServer>,
878     irq_handler_control: &Tube,
879     device_ctrl_tube: &Tube,
880     wait_ctx: &WaitContext<Token>,
881     force_s2idle: bool,
882     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
883 ) -> Result<Option<ExitState>> {
884     let execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
885         let mut run_mode_opt = None;
886         let vcpu_size = vcpu_boxes.lock().len();
887         let resp = request.execute(
888             &guest_os.vm,
889             &mut run_mode_opt,
890             disk_host_tubes,
891             &mut guest_os.pm,
892             #[cfg(feature = "gpu")]
893             gpu_control_tube,
894             #[cfg(not(feature = "gpu"))]
895             None,
896             None,
897             &mut None,
898             |msg| {
899                 kick_all_vcpus(
900                     run_mode_arc,
901                     vcpu_control_channels,
902                     vcpu_boxes,
903                     guest_os.irq_chip.as_ref(),
904                     pvclock_host_tube,
905                     msg,
906                 );
907             },
908             force_s2idle,
909             #[cfg(feature = "swap")]
910             None,
911             device_ctrl_tube,
912             vcpu_size,
913             irq_handler_control,
914             || guest_os.irq_chip.as_ref().snapshot(vcpu_size),
915         );
916         (resp, run_mode_opt)
917     };
918 
919     match event.token {
920         Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
921             Ok(vm_event) => {
922                 let exit_state = match vm_event {
923                     VmEventType::Exit => {
924                         info!("vcpu requested shutdown");
925                         Some(ExitState::Stop)
926                     }
927                     VmEventType::Reset => {
928                         info!("vcpu requested reset");
929                         Some(ExitState::Reset)
930                     }
931                     VmEventType::Crash => {
932                         info!("vcpu crashed");
933                         Some(ExitState::Crash)
934                     }
935                     VmEventType::Panic(_) => {
936                         error!("got pvpanic event. this event is not expected on Windows.");
937                         None
938                     }
939                     VmEventType::WatchdogReset => {
940                         info!("vcpu stall detected");
941                         Some(ExitState::WatchdogReset)
942                     }
943                 };
944                 return Ok(exit_state);
945             }
946             Err(e) => {
947                 warn!("failed to recv VmEvent: {}", e);
948             }
949         },
950         Token::BrokerShutdown => {
951             info!("main loop got broker shutdown event");
952             return Ok(Some(ExitState::Stop));
953         }
954         Token::VmControlServer => {
955             let server =
956                 vm_control_server.expect("control server must exist if this event triggers");
957             let client = server.accept();
958             let id = *next_control_id;
959             *next_control_id += 1;
960             wait_ctx
961                 .add(client.0.get_read_notifier(), Token::VmControl { id })
962                 .exit_context(
963                     Exit::WaitContextAdd,
964                     "failed to add trigger to wait context",
965                 )?;
966             wait_ctx
967                 .add(client.0.get_close_notifier(), Token::VmControl { id })
968                 .exit_context(
969                     Exit::WaitContextAdd,
970                     "failed to add trigger to wait context",
971                 )?;
972             control_tubes.insert(id, TaggedControlTube::Vm(client));
973         }
974         #[allow(clippy::collapsible_match)]
975         Token::VmControl { id } => {
976             if let Some(tube) = control_tubes.get(&id) {
977                 #[allow(clippy::single_match)]
978                 match tube {
979                     TaggedControlTube::Product(product_tube) => {
980                         product::handle_tagged_control_tube_event(
981                             product_tube,
982                             virtio_snd_host_mute_tube,
983                             service_vm_state,
984                             ipc_main_loop_tube,
985                         )
986                     }
987                     TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
988                         Ok(request) => {
989                             let mut run_mode_opt = None;
990                             let response = match request {
991                                 VmRequest::HotPlugVfioCommand { device, add } => {
992                                     // Suppress warnings.
993                                     let _ = (device, add);
994                                     unimplemented!("not implemented on Windows");
995                                 }
996                                 #[cfg(feature = "registered_events")]
997                                 VmRequest::RegisterListener { socket_addr, event } => {
998                                     unimplemented!("not implemented on Windows");
999                                 }
1000                                 #[cfg(feature = "registered_events")]
1001                                 VmRequest::UnregisterListener { socket_addr, event } => {
1002                                     unimplemented!("not implemented on Windows");
1003                                 }
1004                                 #[cfg(feature = "registered_events")]
1005                                 VmRequest::Unregister { socket_addr } => {
1006                                     unimplemented!("not implemented on Windows");
1007                                 }
1008                                 #[cfg(feature = "balloon")]
1009                                 VmRequest::BalloonCommand(cmd) => {
1010                                     if let Some(balloon_tube) = balloon_tube {
1011                                         if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
1012                                         {
1013                                             if key != id {
1014                                                 unimplemented!("not implemented on Windows");
1015                                             }
1016                                             Some(r)
1017                                         } else {
1018                                             None
1019                                         }
1020                                     } else {
1021                                         error!("balloon not enabled");
1022                                         None
1023                                     }
1024                                 }
1025                                 _ => {
1026                                     let (resp, run_mode_ret) =
1027                                         execute_vm_request(request, guest_os);
1028                                     run_mode_opt = run_mode_ret;
1029                                     Some(resp)
1030                                 }
1031                             };
1032 
1033                             if let Some(response) = response {
1034                                 if let Err(e) = tube.0.send(&response) {
1035                                     error!("failed to send VmResponse: {}", e);
1036                                 }
1037                             }
1038                             if let Some(exit_state) =
1039                                 handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1040                             {
1041                                 return Ok(Some(exit_state));
1042                             }
1043                         }
1044                         Err(e) => {
1045                             if let TubeError::Disconnected = e {
1046                                 vm_control_ids_to_remove.push(id);
1047                             } else {
1048                                 error!("failed to recv VmRequest: {}", e);
1049                             }
1050                         }
1051                     },
1052                 }
1053             }
1054         }
1055         #[cfg(feature = "balloon")]
1056         Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
1057             Ok(resp) => {
1058                 for (resp, idx) in resp {
1059                     if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
1060                         if let Err(e) = tube.0.send(&resp) {
1061                             error!("failed to send VmResponse: {}", e);
1062                         }
1063                     } else {
1064                         error!("Bad tube index {}", idx);
1065                     }
1066                 }
1067             }
1068             Err(err) => {
1069                 error!("Error processing balloon tube {:?}", err)
1070             }
1071         },
1072         #[cfg(not(feature = "balloon"))]
1073         Token::BalloonTube => unreachable!("balloon tube not registered"),
1074         #[allow(unreachable_patterns)]
1075         _ => {
1076             let run_mode_opt = product::handle_received_token(
1077                 &event.token,
1078                 anti_tamper_main_thread_tube,
1079                 #[cfg(feature = "balloon")]
1080                 balloon_tube,
1081                 control_tubes,
1082                 guest_os,
1083                 ipc_main_loop_tube,
1084                 memory_size_mb,
1085                 proto_main_loop_tube,
1086                 pvclock_host_tube,
1087                 run_mode_arc,
1088                 service_vm_state,
1089                 vcpu_boxes,
1090                 virtio_snd_host_mute_tube,
1091                 execute_vm_request,
1092             );
1093             if let Some(exit_state) = handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1094             {
1095                 return Ok(Some(exit_state));
1096             }
1097         }
1098     };
1099     Ok(None)
1100 }
1101 
1102 /// Handles a run mode change, if one is pending as a result of a VmRequest.
1103 /// The parameter, run_mode_opt, is the run mode change proposed by the
1104 /// VmRequest's execution.
1105 ///
1106 /// Returns the exit state, if it changed due to a run mode change.
1107 /// None otherwise.
1108 fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1109     run_mode_opt: &Option<VmRunMode>,
1110     guest_os: &mut RunnableLinuxVm<V, Vcpu>,
1111 ) -> Option<ExitState> {
1112     if let Some(run_mode) = run_mode_opt {
1113         info!("control socket changed run mode to {}", run_mode);
1114         match run_mode {
1115             VmRunMode::Exiting => return Some(ExitState::Stop),
1116             other => {
1117                 if other == &VmRunMode::Running {
1118                     for dev in &guest_os.resume_notify_devices {
1119                         dev.lock().resume_imminent();
1120                     }
1121                 }
1122             }
1123         }
1124     }
1125     // No exit state change.
1126     None
1127 }
1128 
1129 /// Commands to control the VM Memory handler thread.
1130 #[derive(serde::Serialize, serde::Deserialize)]
1131 pub enum VmMemoryHandlerRequest {
1132     /// No response is sent for this command.
1133     Exit,
1134 }
1135 
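// Worker thread that services VmMemoryRequests from the per-device control tubes until it
// receives VmMemoryHandlerRequest::Exit.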
1136 fn vm_memory_handler_thread(
1137     control_tubes: Vec<Tube>,
1138     mut vm: impl Vm,
1139     sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
1140     mut gralloc: RutabagaGralloc,
1141     handler_control: Tube,
1142 ) -> anyhow::Result<()> {
1143     #[derive(EventToken)]
1144     enum Token {
1145         VmControl { id: usize },
1146         HandlerControl,
1147     }
1148 
1149     let wait_ctx =
1150         WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
1151             .context("failed to build wait context")?;
1152     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1153     for (id, socket) in control_tubes.iter() {
1154         wait_ctx
1155             .add(socket.get_read_notifier(), Token::VmControl { id: *id })
1156             .context("failed to add descriptor to wait context")?;
1157     }
1158 
1159     let mut region_state = VmMemoryRegionState::new();
1160 
1161     'wait: loop {
1162         let events = {
1163             match wait_ctx.wait() {
1164                 Ok(v) => v,
1165                 Err(e) => {
1166                     error!("failed to poll: {}", e);
1167                     break;
1168                 }
1169             }
1170         };
1171 
1172         let mut vm_control_ids_to_remove = Vec::new();
1173         for event in events.iter().filter(|e| e.is_readable) {
1174             match event.token {
1175                 Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
1176                     Ok(request) => match request {
1177                         VmMemoryHandlerRequest::Exit => break 'wait,
1178                     },
1179                     Err(e) => {
1180                         if let TubeError::Disconnected = e {
1181                             panic!("vm memory control tube disconnected.");
1182                         } else {
1183                             error!("failed to recv VmMemoryHandlerRequest: {}", e);
1184                         }
1185                     }
1186                 },
1187 
1188                 Token::VmControl { id } => {
1189                     if let Some(tube) = control_tubes.get(&id) {
1190                         match tube.recv::<VmMemoryRequest>() {
1191                             Ok(request) => {
1192                                 let response = request.execute(
1193                                     &mut vm,
1194                                     &mut sys_allocator_mutex.lock(),
1195                                     &mut gralloc,
1196                                     None,
1197                                     &mut region_state,
1198                                 );
1199                                 if let Err(e) = tube.send(&response) {
1200                                     error!("failed to send VmMemoryControlResponse: {}", e);
1201                                 }
1202                             }
1203                             Err(e) => {
1204                                 if let TubeError::Disconnected = e {
1205                                     vm_control_ids_to_remove.push(id);
1206                                 } else {
1207                                     error!("failed to recv VmMemoryControlRequest: {}", e);
1208                                 }
1209                             }
1210                         }
1211                     }
1212                 }
1213             }
1214         }
1215 
1216         remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1217         if events
1218             .iter()
1219             .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
1220         {
1221             error!("vm memory handler control hung up but did not request an exit.");
1222             break 'wait;
1223         }
1224     }
1225     Ok(())
1226 }
1227 
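// Creates the optional VM control server (disabled in prod builds) and registers it with the
// main wait context.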
1228 fn create_control_server(
1229     control_server_path: Option<PathBuf>,
1230     wait_ctx: &WaitContext<Token>,
1231 ) -> Result<Option<ControlServer>> {
1232     #[cfg(not(feature = "prod-build"))]
1233     {
1234         if let Some(path) = control_server_path {
1235             let server =
1236                 ControlServer::new(path.to_str().expect("control socket path must be a string"))
1237                     .exit_context(
1238                         Exit::FailedToCreateControlServer,
1239                         "failed to create control server",
1240                     )?;
1241             wait_ctx
1242                 .add(server.client_waiting(), Token::VmControlServer)
1243                 .exit_context(
1244                     Exit::WaitContextAdd,
1245                     "failed to add control server to wait context",
1246                 )?;
1247             return Ok(Some(server));
1248         }
1249     }
1250     Ok::<Option<ControlServer>, anyhow::Error>(None)
1251 }
1252 
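// Top-level VM run function: spawns the IRQ wait worker, VM memory handler, and device worker
// threads, starts the vcpus, and then services control and VM events until an exit state is
// reached.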
1253 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1254     mut guest_os: RunnableLinuxVm<V, Vcpu>,
1255     sys_allocator: SystemAllocator,
1256     control_tubes: Vec<TaggedControlTube>,
1257     irq_control_tubes: Vec<Tube>,
1258     vm_memory_control_tubes: Vec<Tube>,
1259     vm_evt_rdtube: RecvTube,
1260     vm_evt_wrtube: SendTube,
1261     #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
1262     broker_shutdown_evt: Option<Event>,
1263     balloon_host_tube: Option<Tube>,
1264     pvclock_host_tube: Option<Tube>,
1265     disk_host_tubes: Vec<Tube>,
1266     gralloc: RutabagaGralloc,
1267     #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
1268     service_pipe_name: Option<String>,
1269     memory_size_mb: u64,
1270     host_cpu_topology: bool,
1271     tsc_sync_mitigations: TscSyncMitigations,
1272     force_calibrated_tsc_leaf: bool,
1273     mut product_args: RunControlArgs,
1274     mut virtio_snd_host_mute_tube: Option<Tube>,
1275     restore_path: Option<PathBuf>,
1276     control_server_path: Option<PathBuf>,
1277     force_s2idle: bool,
1278     suspended: bool,
1279 ) -> Result<ExitState> {
1280     let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
1281         start_service_ipc_listener(service_pipe_name)?;
1282 
1283     let mut service_vm_state = product::create_service_vm_state(memory_size_mb);
1284 
1285     let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
1286 
1287     let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
1288     let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
1289         Exit::CreateTube,
1290         "failed to create IRQ handler control Tube",
1291     )?;
1292 
1293     // Create a separate thread to wait on IRQ events. This is a natural division
1294     // because IRQ interrupts have no dependencies on other events, and this lets
1295     // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
1296     let irq_join_handle = IrqWaitWorker::start(
1297         irq_handler_control_for_worker,
1298         guest_os
1299             .irq_chip
1300             .try_box_clone()
1301             .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
1302         irq_control_tubes,
1303         sys_allocator_mutex.clone(),
1304     );
1305 
1306     let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
1307     product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
1308     let wait_ctx = WaitContext::build_with(&triggers).exit_context(
1309         Exit::WaitContextAdd,
1310         "failed to add trigger to wait context",
1311     )?;
1312 
1313     #[cfg(feature = "balloon")]
1314     let mut balloon_tube = balloon_host_tube
1315         .map(|tube| -> Result<BalloonTube> {
1316             wait_ctx
1317                 .add(tube.get_read_notifier(), Token::BalloonTube)
1318                 .context("failed to add trigger to wait context")?;
1319             Ok(BalloonTube::new(tube))
1320         })
1321         .transpose()
1322         .context("failed to create balloon tube")?;
1323 
1324     let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
1325     let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
1326         .name("vm_memory_handler_thread".into())
1327         .spawn({
1328             let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
1329             let sys_allocator_mutex = sys_allocator_mutex.clone();
1330             move || {
1331                 vm_memory_handler_thread(
1332                     vm_memory_control_tubes,
1333                     vm,
1334                     sys_allocator_mutex,
1335                     gralloc,
1336                     vm_memory_handler_control_for_thread,
1337                 )
1338             }
1339         })
1340         .unwrap();
1341 
1342     if let Some(evt) = broker_shutdown_evt.as_ref() {
1343         wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
1344             Exit::WaitContextAdd,
1345             "failed to add trigger to wait context",
1346         )?;
1347     }
1348 
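    // Index the control tubes by ID so they can be added and removed while the VM runs; product
    // tubes are registered with the wait context so their requests wake the main loop.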
1349     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1350     let mut next_control_id = control_tubes.len();
1351     for (id, control_tube) in control_tubes.iter() {
1352         #[allow(clippy::single_match)]
1353         match control_tube {
1354             TaggedControlTube::Product(product_tube) => wait_ctx
1355                 .add(
1356                     product_tube.get_read_notifier(),
1357                     Token::VmControl { id: *id },
1358                 )
1359                 .exit_context(
1360                     Exit::WaitContextAdd,
1361                     "failed to add trigger to wait context",
1362                 )?,
1363             _ => (),
1364         }
1365     }
1366 
1367     let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
1368     guest_os.devices_thread = match create_devices_worker_thread(
1369         guest_os.vm.get_memory().clone(),
1370         guest_os.io_bus.clone(),
1371         guest_os.mmio_bus.clone(),
1372         device_ctrl_resp,
1373     ) {
1374         Ok(join_handle) => Some(join_handle),
1375         Err(e) => {
1376             return Err(anyhow!("Failed to start devices thread: {}", e));
1377         }
1378     };
1379 
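    // Take ownership of any vcpus created during VM building; otherwise build a placeholder list
    // of Nones, one per vcpu.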
1380     let vcpus: Vec<Option<_>> = match guest_os.vcpus.take() {
1381         Some(vec) => vec.into_iter().map(Some).collect(),
1382         None => iter::repeat_with(|| None)
1383             .take(guest_os.vcpu_count)
1384             .collect(),
1385     };
1386 
1387     let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);
1388 
1389     let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;
1390 
1391     let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;
1392 
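    // Put stdin into raw mode so console input reaches the guest unmodified; the original mode is
    // restored just before run_control returns.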
1393     let original_terminal_mode = stdin().set_raw_mode().ok();
1394 
1395     let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
1396     let run_mode_arc = Arc::new(VcpuRunMode::default());
1397 
1398     let run_mode_state = if suspended {
1399         // Sleep devices before creating vcpus.
1400         device_ctrl_tube
1401             .send(&DeviceControlCommand::SleepDevices)
1402             .context("send command to devices control socket")?;
1403         match device_ctrl_tube
1404             .recv()
1405             .context("receive from devices control socket")?
1406         {
1407             VmResponse::Ok => (),
1408             resp => bail!("device sleep failed: {}", resp),
1409         }
1410         run_mode_arc.set_and_notify(VmRunMode::Suspending);
1411         VmRunMode::Suspending
1412     } else {
1413         VmRunMode::Running
1414     };
1415 
1416     // If we are restoring from a snapshot, then start suspended.
1417     if restore_path.is_some() {
1418         run_mode_arc.set_and_notify(VmRunMode::Suspending);
1419     }
1420 
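    // Spawn one runner thread per vcpu. Each thread gets a control channel the main loop uses to
    // deliver VcpuControl messages.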
1421     let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
1422         vcpus,
1423         vcpu_boxes.clone(),
1424         &guest_os,
1425         &exit_evt,
1426         &vm_evt_wrtube,
1427         #[cfg(feature = "stats")]
1428         &stats,
1429         host_cpu_topology,
1430         run_mode_arc.clone(),
1431         tsc_sync_mitigations,
1432         force_calibrated_tsc_leaf,
1433     )?;
1434 
1435     // Restore VM (if applicable).
1436     if let Some(path) = restore_path {
1437         vm_control::do_restore(
1438             path,
1439             &guest_os.vm,
1440             |msg| {
1441                 kick_all_vcpus(
1442                     run_mode_arc.as_ref(),
1443                     &vcpu_control_channels,
1444                     vcpu_boxes.as_ref(),
1445                     guest_os.irq_chip.as_ref(),
1446                     &pvclock_host_tube,
1447                     msg,
1448                 )
1449             },
1450             |msg, index| {
1451                 kick_vcpu(
1452                     run_mode_arc.as_ref(),
1453                     &vcpu_control_channels,
1454                     vcpu_boxes.as_ref(),
1455                     guest_os.irq_chip.as_ref(),
1456                     &pvclock_host_tube,
1457                     index,
1458                     msg,
1459                 )
1460             },
1461             &irq_handler_control,
1462             &device_ctrl_tube,
1463             guest_os.vcpu_count,
1464             |image| {
1465                 guest_os
1466                     .irq_chip
1467                     .try_box_clone()?
1468                     .restore(image, guest_os.vcpu_count)
1469             },
1470             /* require_encrypted= */ false,
1471         )?;
1472         // Allow the vCPUs to start for real.
1473         kick_all_vcpus(
1474             run_mode_arc.as_ref(),
1475             &vcpu_control_channels,
1476             vcpu_boxes.as_ref(),
1477             guest_os.irq_chip.as_ref(),
1478             &pvclock_host_tube,
1479             // Other platforms (unix) have multiple modes they could start in (e.g. starting for
1480             // guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
1481             // to enter that mode here rather than VmRunMode::Running.
1482             VcpuControl::RunState(run_mode_state),
1483         );
1484     }
1485 
1486     let mut exit_state = ExitState::Stop;
1487     let mut region_state = VmMemoryRegionState::new();
1488 
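    // Main control loop: wait for events, dispatch the readable ones, then prune any control tubes
    // that were closed. The loop exits once a handler produces an exit state.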
1489     'poll: loop {
1490         let events = {
1491             match wait_ctx.wait() {
1492                 Ok(v) => v,
1493                 Err(e) => {
1494                     error!("failed to wait: {}", e);
1495                     break;
1496                 }
1497             }
1498         };
1499 
1500         let mut vm_control_ids_to_remove = Vec::new();
1501         for event in events.iter().filter(|e| e.is_readable) {
1502             let state = handle_readable_event(
1503                 event,
1504                 &mut vm_control_ids_to_remove,
1505                 &mut next_control_id,
1506                 &mut service_vm_state,
1507                 disk_host_tubes.as_slice(),
1508                 ipc_main_loop_tube.as_ref(),
1509                 #[cfg(feature = "gpu")]
1510                 gpu_control_tube.as_ref(),
1511                 &vm_evt_rdtube,
1512                 &mut control_tubes,
1513                 &mut guest_os,
1514                 &sys_allocator_mutex,
1515                 &mut virtio_snd_host_mute_tube,
1516                 proto_main_loop_tube.as_ref(),
1517                 &anti_tamper_main_thread_tube,
1518                 #[cfg(feature = "balloon")]
1519                 balloon_tube.as_mut(),
1520                 memory_size_mb,
1521                 vcpu_boxes.as_ref(),
1522                 &pvclock_host_tube,
1523                 run_mode_arc.as_ref(),
1524                 &mut region_state,
1525                 vm_control_server.as_mut(),
1526                 &irq_handler_control,
1527                 &device_ctrl_tube,
1528                 &wait_ctx,
1529                 force_s2idle,
1530                 &vcpu_control_channels,
1531             )?;
1532             if let Some(state) = state {
1533                 exit_state = state;
1534                 break 'poll;
1535             }
1536         }
1537 
1538         remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1539     }
1540 
1541     info!("run_control poll loop completed, forcing vCPUs to exit...");
1542 
1543     // VCPU threads MUST see the VmRunMode::Exiting flag, otherwise they may re-enter the VM.
1544     run_mode_arc.set_and_notify(VmRunMode::Exiting);
1545 
1546     // Force all vcpus to exit from the hypervisor
1547     for vcpu in vcpu_boxes.lock().iter() {
1548         vcpu.set_immediate_exit(true);
1549     }
1550 
1551     let mut res = Ok(exit_state);
1552     guest_os.irq_chip.kick_halted_vcpus();
1553     let _ = exit_evt.signal();
1554 
1555     if guest_os.devices_thread.is_some() {
1556         if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
1557             error!("failed to stop device control loop: {}", e);
1558         };
1559         if let Some(thread) = guest_os.devices_thread.take() {
1560             if let Err(e) = thread.join() {
1561                 error!("failed to exit devices thread: {:?}", e);
1562             }
1563         }
1564     }
1565 
1566     // Shut down the VM memory handler thread.
1567     if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
1568         error!(
1569             "failed to request exit from VM memory handler thread: {}",
1570             e
1571         );
1572     }
1573     if let Err(e) = vm_memory_handler_thread_join_handle.join() {
1574         error!("failed to exit VM Memory handler thread: {:?}", e);
1575     }
1576 
1577     // Shut down the IRQ handler thread.
1578     if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
1579         error!("failed to request exit from IRQ handler thread: {}", e);
1580     }
1581 
1582     // Ensure any child threads have ended by sending the Exit vm event (possibly again) to ensure
1583     // their run loops are aborted.
1584     let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
1585     for (i, thread) in vcpu_threads.into_iter().enumerate() {
1586         // Wait until all the threads exit so that the guest_os.vm Arc reference count drops to 1;
1587         // otherwise, forcibly terminating the threads would leak the memory they still reference.
1588         match thread.join() {
1589             Ok(Err(e)) => {
1590                 error!("vcpu thread {} exited with an error: {}", i, e);
1591                 res = Err(e);
1592             }
1593             Ok(_) => {}
1594             Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
1595         }
1596     }
1597 
1598     info!("vCPU threads have exited.");
1599 
1600     if let Some(ime) = ime_thread {
1601         match ime.join() {
1602             Ok(Err(e)) => {
1603                 error!("ime thread exited with an error: {}", e);
1604                 if res.is_ok() {
1605                     // Prioritize earlier errors: only return this error if none has been
1606                     // recorded yet; otherwise just log it.
1607                     res = Err(e)
1608                 }
1609             }
1610             Ok(_) => {}
1611             Err(e) => error!("ime thread panicked: {:?}", e),
1612         }
1613     }
1614     info!("IME thread has exited.");
1615 
1616     // This cancels all the outstanding and any future blocking operations.
1617     // TODO(b/196911556): Shut down the executor for a cleaner shutdown. Because we are using a
1618     // global executor, the cleaner option for now is to call disarm so that incoming requests are
1619     // still run and then cancelled. If we called shutdown instead, all blocking threads would go
1620     // away, and incoming operations would not be scheduled to run and would be dropped, leading
1621     // to a panic. The ideal place to call shutdown would be when a non-global executor is dropped.
1622     cros_async::unblock_disarm();
1623     info!("blocking async pool has shut down.");
1624 
1625     let _ = irq_join_handle.join();
1626     info!("IrqWaitWorker has shut down.");
1627 
1628     #[cfg(feature = "stats")]
1629     if let Some(stats) = stats {
1630         println!("Statistics Collected:\n{}", stats.lock());
1631         println!("Statistics JSON:\n{}", stats.lock().json());
1632     }
1633 
1634     if let Some(mode) = original_terminal_mode {
1635         if let Err(e) = stdin().restore_mode(mode) {
1636             warn!("failed to restore terminal mode: {}", e);
1637         }
1638     }
1639 
1640     // Explicitly drop the VM structure here to allow the devices to clean up before the
1641     // control tubes are closed when this function exits.
1642     mem::drop(guest_os);
1643 
1644     info!("guest_os dropped, run_control is done.");
1645 
1646     res
1647 }
1648 
1649 /// Remove Tubes that have been closed from the WaitContext.
1650 fn remove_closed_tubes<T, U>(
1651     wait_ctx: &WaitContext<T>,
1652     tubes: &mut BTreeMap<usize, U>,
1653     mut tube_ids_to_remove: Vec<usize>,
1654 ) -> anyhow::Result<()>
1655 where
1656     T: EventToken,
1657     U: ReadNotifier + CloseNotifier,
1658 {
1659     tube_ids_to_remove.dedup();
1660     for id in tube_ids_to_remove {
1661         if let Some(socket) = tubes.remove(&id) {
1662             wait_ctx
1663                 .delete(socket.get_read_notifier())
1664                 .context("failed to remove descriptor from wait context")?;
1665 
1666             // There may be a close notifier registered for this Tube. If there isn't one
1667             // registered, we just ignore the error.
1668             let _ = wait_ctx.delete(socket.get_close_notifier());
1669         }
1670     }
1671     Ok(())
1672 }
1673 
1674 /// Sends a message to all VCPUs.
1675 fn kick_all_vcpus(
1676     run_mode: &VcpuRunMode,
1677     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1678     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1679     irq_chip: &dyn IrqChipArch,
1680     pvclock_host_tube: &Option<Tube>,
1681     msg: VcpuControl,
1682 ) {
1683     // On Windows, we handle run mode switching directly rather than delegating to the VCPU thread
1684     // like unix does.
1685     match &msg {
1686         VcpuControl::RunState(VmRunMode::Suspending) => {
1687             suspend_all_vcpus(run_mode, vcpu_boxes, irq_chip, pvclock_host_tube);
1688             return;
1689         }
1690         VcpuControl::RunState(VmRunMode::Running) => {
1691             resume_all_vcpus(run_mode, vcpu_boxes, irq_chip, pvclock_host_tube);
1692             return;
1693         }
1694         _ => (),
1695     }
1696 
1697     // For non-RunState commands, we dispatch just like unix would.
1698     for vcpu in vcpu_control_channels {
1699         if let Err(e) = vcpu.send(msg.clone()) {
1700             error!("failed to send VcpuControl message: {}", e);
1701         }
1702     }
1703 
1704     // Now that we've sent a message, we need VCPUs to exit so they can process it.
1705     for vcpu in vcpu_boxes.lock().iter() {
1706         vcpu.set_immediate_exit(true);
1707     }
1708     irq_chip.kick_halted_vcpus();
1709 
1710     // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1711     // the control message.
1712     let current_run_mode = run_mode.get_mode();
1713     if current_run_mode != VmRunMode::Running {
1714         run_mode.set_and_notify(current_run_mode);
1715     }
1716 }
1717 
1718 /// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a single
1719 /// VCPU.
1720 fn kick_vcpu(
1721     run_mode: &VcpuRunMode,
1722     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1723     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1724     irq_chip: &dyn IrqChipArch,
1725     pvclock_host_tube: &Option<Tube>,
1726     index: usize,
1727     msg: VcpuControl,
1728 ) {
1729     assert!(
1730         !matches!(msg, VcpuControl::RunState(_)),
1731         "Windows does not support RunState changes on a per VCPU basis"
1732     );
1733 
1734     let vcpu = vcpu_control_channels
1735         .get(index)
1736         .expect("invalid vcpu index specified");
1737     if let Err(e) = vcpu.send(msg) {
1738         error!("failed to send VcpuControl message: {}", e);
1739     }
1740 
1741     // Now that we've sent a message, we need the VCPU to exit so it can
1742     // process the message.
1743     vcpu_boxes
1744         .lock()
1745         .get(index)
1746         .expect("invalid vcpu index specified")
1747         .set_immediate_exit(true);
1748     irq_chip.kick_halted_vcpus();
1749 
1750     // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1751     // the control message. (Technically this wakes all VCPUs, but those without messages will go
1752     // back to sleep.)
1753     let current_run_mode = run_mode.get_mode();
1754     if current_run_mode != VmRunMode::Running {
1755         run_mode.set_and_notify(current_run_mode);
1756     }
1757 }
1758 
1759 /// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
1760 /// though devices on the host will continue to run.
1761 pub(crate) fn suspend_all_vcpus(
1762     run_mode: &VcpuRunMode,
1763     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1764     irq_chip: &dyn IrqChipArch,
1765     pvclock_host_tube: &Option<Tube>,
1766 ) {
1767     // VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
1768     // they may re-enter the VM.
1769     run_mode.set_and_notify(VmRunMode::Suspending);
1770 
1771     // Force all vcpus to exit from the hypervisor
1772     for vcpu in vcpu_boxes.lock().iter() {
1773         vcpu.set_immediate_exit(true);
1774     }
1775     irq_chip.kick_halted_vcpus();
1776 
1777     handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
1778         .unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
1779 }
1780 
1781 /// Resumes all VCPUs.
1782 pub(crate) fn resume_all_vcpus(
1783     run_mode: &VcpuRunMode,
1784     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1785     irq_chip: &dyn IrqChipArch,
1786     pvclock_host_tube: &Option<Tube>,
1787 ) {
1788     handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
1789         .unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));
1790 
1791     // Make sure any immediate exit bits are disabled
1792     for vcpu in vcpu_boxes.lock().iter() {
1793         vcpu.set_immediate_exit(false);
1794     }
1795 
1796     run_mode.set_and_notify(VmRunMode::Running);
1797 }
1798 
1799 #[cfg(feature = "gvm")]
1800 const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
1801     major: 1,
1802     minor: 4,
1803     patch: 1,
1804 };
1805 
1806 #[cfg(feature = "gvm")]
1807 fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
1808     match gvm.get_full_version() {
1809         Ok(version) => {
1810             if version < GVM_MINIMUM_VERSION {
1811                 error!(
1812                     "GVM version {} is below minimum version {}",
1813                     version, GVM_MINIMUM_VERSION
1814                 );
1815                 return Err(base::Error::new(libc::ENXIO).into());
1816             } else {
1817                 info!("Using GVM version {}.", version)
1818             }
1819         }
1820         Err(e) => {
1821             error!("unable to determine gvm version: {}", e);
1822             return Err(base::Error::new(libc::ENXIO).into());
1823         }
1824     }
1825     let vm = GvmVm::new(&gvm, mem)?;
1826     Ok(vm)
1827 }
1828 
1829 #[cfg(feature = "haxm")]
1830 fn create_haxm_vm(
1831     haxm: Haxm,
1832     mem: GuestMemory,
1833     kernel_log_file: &Option<String>,
1834 ) -> Result<HaxmVm> {
1835     let vm = HaxmVm::new(&haxm, mem)?;
1836     if let Some(path) = kernel_log_file {
1837         use hypervisor::haxm::HAX_CAP_VM_LOG;
1838         if vm.check_raw_capability(HAX_CAP_VM_LOG) {
1839             match vm.register_log_file(path) {
1840                 Ok(_) => {}
1841                 Err(e) => match e.errno() {
1842                     libc::E2BIG => {
1843                         error!(
1844                             "kernel_log_file path is too long, kernel log file will not be written"
1845                         );
1846                     }
1847                     _ => return Err(e.into()),
1848                 },
1849             }
1850         } else {
1851             warn!(
1852                 "kernel_log_file specified but this version of HAXM does not support kernel log \
1853                   files"
1854             );
1855         }
1856     }
1857     Ok(vm)
1858 }
1859 
1860 #[cfg(feature = "whpx")]
1861 #[cfg(target_arch = "x86_64")]
1862 fn create_whpx_vm(
1863     whpx: Whpx,
1864     mem: GuestMemory,
1865     cpu_count: usize,
1866     no_smt: bool,
1867     apic_emulation: bool,
1868     force_calibrated_tsc_leaf: bool,
1869     vm_evt_wrtube: SendTube,
1870 ) -> Result<WhpxVm> {
1871     let cpu_config = hypervisor::CpuConfigX86_64::new(
1872         force_calibrated_tsc_leaf,
1873         false, /* host_cpu_topology */
1874         false, /* enable_hwp */
1875         no_smt,
1876         false, /* itmt */
1877         None,  /* hybrid_type */
1878     );
1879 
1880     // context for non-cpu-specific cpuid results
1881     let ctx = CpuIdContext::new(
1882         0,
1883         cpu_count,
1884         None,
1885         cpu_config,
1886         whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
1887         __cpuid_count,
1888         __cpuid,
1889     );
1890 
1891     // Get all cpuid entries that we should pre-set
1892     let mut cpuid = whpx.get_supported_cpuid()?;
1893 
1894     // Adjust them for crosvm
1895     for entry in cpuid.cpu_id_entries.iter_mut() {
1896         adjust_cpuid(entry, &ctx);
1897     }
1898 
1899     let vm = WhpxVm::new(
1900         &whpx,
1901         cpu_count,
1902         mem,
1903         cpuid,
1904         apic_emulation,
1905         Some(vm_evt_wrtube),
1906     )
1907     .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
1908 
1909     Ok(vm)
1910 }
1911 
1912 #[cfg(feature = "gvm")]
1913 fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
1914     info!("Creating GVM irqchip");
1915     let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
1916     Ok(irq_chip)
1917 }
1918 
1919 #[cfg(feature = "whpx")]
1920 #[cfg(target_arch = "x86_64")]
1921 fn create_whpx_split_irq_chip(
1922     vm: &WhpxVm,
1923     ioapic_device_tube: Tube,
1924 ) -> base::Result<WhpxSplitIrqChip> {
1925     info!("Creating WHPX split irqchip");
1926     WhpxSplitIrqChip::new(
1927         vm.try_clone()?,
1928         ioapic_device_tube,
1929         None, // ioapic_pins
1930     )
1931 }
1932 
1933 fn create_userspace_irq_chip<Vcpu>(
1934     vcpu_count: usize,
1935     ioapic_device_tube: Tube,
1936 ) -> base::Result<UserspaceIrqChip<Vcpu>>
1937 where
1938     Vcpu: VcpuArch + 'static,
1939 {
1940     info!("Creating userspace irqchip");
1941     let irq_chip =
1942         UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
1943     Ok(irq_chip)
1944 }
1945 
1946 pub fn get_default_hypervisor() -> Option<HypervisorKind> {
1947     // The ordering here matters: hypervisors are tried from most preferable to least.
1948     #[cfg(feature = "whpx")]
1949     match hypervisor::whpx::Whpx::is_enabled() {
1950         true => return Some(HypervisorKind::Whpx),
1951         false => warn!("Whpx not enabled."),
1952     };
1953 
1954     #[cfg(feature = "haxm")]
1955     if get_cpu_manufacturer() == CpuManufacturer::Intel {
1956         // Make sure Haxm device can be opened before selecting it.
1957         match Haxm::new() {
1958             Ok(_) => return Some(HypervisorKind::Ghaxm),
1959             Err(e) => warn!("Cannot initialize HAXM: {}", e),
1960         };
1961     }
1962 
1963     #[cfg(feature = "gvm")]
1964     // Make sure Gvm device can be opened before selecting it.
1965     match Gvm::new() {
1966         Ok(_) => return Some(HypervisorKind::Gvm),
1967         Err(e) => warn!("Cannot initialize GVM: {}", e),
1968     };
1969 
1970     None
1971 }
1972 
1973 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
1974     let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
1975         Some(
1976             File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
1977                 format!("failed to open initrd {}", initrd_path.display())
1978             })?,
1979         )
1980     } else {
1981         None
1982     };
1983 
1984     let vm_image = match cfg.executable_path {
1985         Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
1986             File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
1987                 format!("failed to open kernel image {}", kernel_path.display(),)
1988             })?,
1989         ),
1990         Some(Executable::Bios(ref bios_path)) => {
1991             VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
1992                 format!("failed to open bios {}", bios_path.display())
1993             })?)
1994         }
1995         _ => panic!("Did not receive a bios or kernel, should be impossible."),
1996     };
1997 
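    // The requested swiotlb size is in MiB; protected VMs get a 64 MiB default if no size was
    // specified.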
1998     let swiotlb = if let Some(size) = cfg.swiotlb {
1999         Some(
2000             size.checked_mul(1024 * 1024)
2001                 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
2002         )
2003     } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
2004         None
2005     } else {
2006         Some(64 * 1024 * 1024)
2007     };
2008 
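    // Open the pflash image read/write if one was configured; otherwise no pflash is exposed and
    // the block size stays 0.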
2009     let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
2010     {
2011         (
2012             Some(
2013                 open_file_or_duplicate(
2014                     &pflash_parameters.path,
2015                     OpenOptions::new().read(true).write(true),
2016                 )
2017                 .with_context(|| {
2018                     format!("failed to open pflash {}", pflash_parameters.path.display())
2019                 })?,
2020             ),
2021             pflash_parameters.block_size,
2022         )
2023     } else {
2024         (None, 0)
2025     };
2026 
2027     Ok(VmComponents {
2028         memory_size: cfg
2029             .memory
2030             .unwrap_or(256)
2031             .checked_mul(1024 * 1024)
2032             .ok_or_else(|| anyhow!("requested memory size too large"))?,
2033         swiotlb,
2034         vcpu_count: cfg.vcpu_count.unwrap_or(1),
2035         fw_cfg_enable: false,
2036         bootorder_fw_cfg_blob: Vec::new(),
2037         vcpu_affinity: cfg.vcpu_affinity.clone(),
2038         cpu_clusters: cfg.cpu_clusters.clone(),
2039         cpu_capacity: cfg.cpu_capacity.clone(),
2040         no_smt: cfg.no_smt,
2041         hugepages: cfg.hugepages,
2042         hv_cfg: hypervisor::Config {
2043             protection_type: cfg.protection_type,
2044         },
2045         vm_image,
2046         android_fstab: cfg
2047             .android_fstab
2048             .as_ref()
2049             .map(|x| {
2050                 File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
2051                     format!("failed to open android fstab file {}", x.display())
2052                 })
2053             })
2054             .map_or(Ok(None), |v| v.map(Some))?,
2055         pstore: cfg.pstore.clone(),
2056         pflash_block_size,
2057         pflash_image,
2058         initrd_image,
2059         extra_kernel_params: cfg.params.clone(),
2060         acpi_sdts: cfg
2061             .acpi_tables
2062             .iter()
2063             .map(|path| {
2064                 SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
2065                     format!("failed to open ACPI file {}", path.display())
2066                 })
2067             })
2068             .collect::<Result<Vec<SDT>>>()?,
2069         rt_cpus: cfg.rt_cpus.clone(),
2070         delay_rt: cfg.delay_rt,
2071         no_i8042: cfg.no_i8042,
2072         no_rtc: cfg.no_rtc,
2073         host_cpu_topology: cfg.host_cpu_topology,
2074         #[cfg(target_arch = "x86_64")]
2075         force_s2idle: cfg.force_s2idle,
2076         fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
2077         itmt: false,
2078         pvm_fw: None,
2079         #[cfg(target_arch = "x86_64")]
2080         pci_low_start: cfg.pci_low_start,
2081         #[cfg(target_arch = "x86_64")]
2082         pcie_ecam: cfg.pcie_ecam,
2083         #[cfg(target_arch = "x86_64")]
2084         smbios: cfg.smbios.clone(),
2085         dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
2086         #[cfg(target_arch = "x86_64")]
2087         break_linux_pci_config_io: cfg.break_linux_pci_config_io,
2088         boot_cpu: cfg.boot_cpu,
2089     })
2090 }
2091 
2092 // Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
2093 enum WindowsIrqChip<V: VcpuArch> {
2094     Userspace(UserspaceIrqChip<V>),
2095     #[cfg(feature = "gvm")]
2096     Gvm(GvmIrqChip),
2097     #[cfg(feature = "whpx")]
2098     WhpxSplit(WhpxSplitIrqChip),
2099 }
2100 
2101 impl<V: VcpuArch> WindowsIrqChip<V> {
2102     // Convert our enum to a &mut dyn IrqChipArch
2103     fn as_mut(&mut self) -> &mut dyn IrqChipArch {
2104         match self {
2105             WindowsIrqChip::Userspace(i) => i,
2106             #[cfg(feature = "gvm")]
2107             WindowsIrqChip::Gvm(i) => i,
2108             #[cfg(feature = "whpx")]
2109             WindowsIrqChip::WhpxSplit(i) => i,
2110         }
2111     }
2112 }
2113 
2114 /// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
2115 /// need access to it when tracing is enabled.
2116 static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
2117 
2118 /// Save the TSC offset for a particular vcpu.
2119 ///
2120 /// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
2121 /// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
2122 /// it can cause clock issues in the guest.
2123 pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
2124     let offsets_copy = {
2125         let mut offsets = TSC_OFFSETS.lock();
2126         // make sure offsets vec is large enough before inserting
2127         let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
2128         offsets.resize(newlen, None);
2129         offsets[vcpu_id] = Some(offset);
2130 
2131         offsets.clone()
2132     };
2133 
2134     // do statistics on a clone of the offsets so we don't hold up other vcpus at this point
2135     info!(
2136         "TSC offset standard deviation is: {}",
2137         standard_deviation(
2138             &offsets_copy
2139                 .iter()
2140                 .filter(|x| x.is_some())
2141                 .map(|x| x.unwrap() as u128)
2142                 .collect::<Vec<u128>>()
2143         )
2144     );
2145 }
2146 
2147 /// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
2148 #[cfg(feature = "perfetto")]
2149 pub fn get_vcpu_tsc_offset() -> u64 {
2150     if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
2151         return *offset;
2152     }
2153     0
2154 }
2155 
2156 /// Callback that is registered with tracing crate, and will be called by the tracing thread when
2157 /// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for
2158 /// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the
2159 /// host TSC. Redundant snapshots should not be a problem for perfetto.
2160 #[cfg(feature = "perfetto")]
2161 fn set_tsc_clock_snapshot() {
2162     let freq = match devices::tsc::tsc_frequency() {
2163         Err(e) => {
2164             error!(
2165                 "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
2166                 e
2167             );
2168             return;
2169         }
2170         Ok(freq) => freq,
2171     };
2172 
2173     // The offset is the guest TSC relative to the host TSC (guest = host + offset).
2174     let offset = get_vcpu_tsc_offset();
2175     // SAFETY: safe because _rdtsc takes no arguments.
2176     let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
2177     perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
2178         // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
2179         // support floating point multipliers yet. So for now we set the freq in Hz and rely
2180         // on the merge tool to fix it.
2181         perfetto::Clock::new(
2182             perfetto::BuiltinClock::Tsc as u32,
2183             host_tsc.wrapping_add(offset),
2184         )
2185         .set_multiplier(freq as u64),
2186         perfetto::Clock::new(
2187             // The host builtin clock ids are all offset from the guest ids by
2188             // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
2189             // contains both a guest and host clock, we need to offset it before merge.
2190             perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
2191             host_tsc,
2192         )
2193         .set_multiplier(freq as u64),
2194     ));
2195 }
2196 
2197 /// Launches run_config for the broker, reading configuration from a TubeTransporter.
2198 pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
2199     let tube_transporter =
2200         // SAFETY:
2201         // Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
2202         // the blocking & framing modes are accurate because we create them ourselves in the broker.
2203         unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
2204 
2205     let mut tube_data_list = tube_transporter
2206         .read_tubes()
2207         .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
2208 
2209     let bootstrap_tube = tube_data_list
2210         .get_tube(TubeToken::Bootstrap)
2211         .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
2212 
2213     let mut cfg: Config = bootstrap_tube
2214         .recv::<Config>()
2215         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2216 
2217     let startup_args: CommonChildStartupArgs = bootstrap_tube
2218         .recv::<CommonChildStartupArgs>()
2219         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2220     let _child_cleanup = common_child_setup(startup_args).exit_context(
2221         Exit::CommonChildSetupError,
2222         "failed to perform common child setup",
2223     )?;
2224 
2225     cfg.broker_shutdown_event = Some(
2226         bootstrap_tube
2227             .recv::<Event>()
2228             .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
2229     );
2230     #[cfg(feature = "crash-report")]
2231     let crash_tube_map = bootstrap_tube
2232         .recv::<HashMap<ProcessType, Vec<SendTube>>>()
2233         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2234     #[cfg(feature = "crash-report")]
2235     crash_report::set_crash_tube_map(crash_tube_map);
2236 
2237     let BrokerTubes {
2238         vm_evt_wrtube,
2239         vm_evt_rdtube,
2240     } = bootstrap_tube
2241         .recv::<BrokerTubes>()
2242         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2243 
2244     run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2245 }
2246 
2247 pub fn run_config(cfg: Config) -> Result<ExitState> {
2248     let _raise_timer_resolution = enable_high_res_timers()
2249         .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
2250 
2251     // There is no broker when using run_config(), so the vm_evt tubes need to be created.
2252     let (vm_evt_wrtube, vm_evt_rdtube) =
2253         Tube::directional_pair().context("failed to create vm event tube")?;
2254 
2255     run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2256 }
2257 
2258 fn create_guest_memory(
2259     components: &VmComponents,
2260     hypervisor: &impl Hypervisor,
2261 ) -> Result<GuestMemory> {
2262     let guest_mem_layout = Arch::guest_memory_layout(components, hypervisor).exit_context(
2263         Exit::GuestMemoryLayout,
2264         "failed to create guest memory layout",
2265     )?;
2266     GuestMemory::new_with_options(&guest_mem_layout)
2267         .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
2268 }
2269 
2270 fn run_config_inner(
2271     cfg: Config,
2272     vm_evt_wrtube: SendTube,
2273     vm_evt_rdtube: RecvTube,
2274 ) -> Result<ExitState> {
2275     product::setup_common_metric_invariants(&cfg);
2276 
2277     #[cfg(feature = "perfetto")]
2278     cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);
2279 
2280     let components: VmComponents = setup_vm_components(&cfg)?;
2281 
2282     #[allow(unused_mut)]
2283     let mut hypervisor = cfg
2284         .hypervisor
2285         .or_else(get_default_hypervisor)
2286         .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
2287 
2288     #[cfg(feature = "whpx")]
2289     if hypervisor::whpx::Whpx::is_enabled() {
2290         // If WHPX is enabled, no other hypervisor can be used, so just override it
2291         hypervisor = HypervisorKind::Whpx;
2292     }
2293 
2294     match hypervisor {
2295         #[cfg(feature = "haxm")]
2296         HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
2297             if hypervisor == HypervisorKind::Haxm {
2298                 set_use_ghaxm(false);
2299             }
2300             info!("Creating HAXM ghaxm={}", get_use_ghaxm());
2301             let haxm = Haxm::new()?;
2302             let guest_mem = create_guest_memory(&components, &haxm)?;
2303             let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
2304             let (ioapic_host_tube, ioapic_device_tube) =
2305                 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2306             let irq_chip =
2307                 create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
2308             run_vm::<HaxmVcpu, HaxmVm>(
2309                 cfg,
2310                 components,
2311                 vm,
2312                 WindowsIrqChip::Userspace(irq_chip).as_mut(),
2313                 Some(ioapic_host_tube),
2314                 vm_evt_wrtube,
2315                 vm_evt_rdtube,
2316             )
2317         }
2318         #[cfg(feature = "whpx")]
2319         HypervisorKind::Whpx => {
2320             let apic_emulation_supported =
2321                 Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
2322                     .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;
2323 
2324             let no_smt = cfg.no_smt;
2325 
2326             // Default to WhpxSplitIrqChip if it's supported because it's more performant
2327             let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
2328                 IrqChipKind::Split
2329             } else {
2330                 IrqChipKind::Userspace
2331             });
2332 
2333             // Both WHPX irq chips use a userspace IOAPIC
2334             let (ioapic_host_tube, ioapic_device_tube) =
2335                 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2336 
2337             info!("Creating Whpx");
2338             let whpx = Whpx::new()?;
2339             let guest_mem = create_guest_memory(&components, &whpx)?;
2340             let vm = create_whpx_vm(
2341                 whpx,
2342                 guest_mem,
2343                 components.vcpu_count,
2344                 no_smt,
2345                 apic_emulation_supported && irq_chip == IrqChipKind::Split,
2346                 cfg.force_calibrated_tsc_leaf,
2347                 vm_evt_wrtube
2348                     .try_clone()
2349                     .expect("could not clone vm_evt_wrtube"),
2350             )?;
2351 
2352             let mut irq_chip = match irq_chip {
2353                 IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
2354                 IrqChipKind::Split => {
2355                     if !apic_emulation_supported {
2356                         panic!(
2357                             "split irqchip specified but your WHPX version does not support \
2358                                local apic emulation"
2359                         );
2360                     }
2361                     WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
2362                 }
2363                 IrqChipKind::Userspace => {
2364                     WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
2365                         components.vcpu_count,
2366                         ioapic_device_tube,
2367                     )?)
2368                 }
2369             };
2370             run_vm::<WhpxVcpu, WhpxVm>(
2371                 cfg,
2372                 components,
2373                 vm,
2374                 irq_chip.as_mut(),
2375                 Some(ioapic_host_tube),
2376                 vm_evt_wrtube,
2377                 vm_evt_rdtube,
2378             )
2379         }
2380         #[cfg(feature = "gvm")]
2381         HypervisorKind::Gvm => {
2382             info!("Creating GVM");
2383             let gvm = Gvm::new()?;
2384             let guest_mem = create_guest_memory(&components, &gvm)?;
2385             let vm = create_gvm_vm(gvm, guest_mem)?;
2386             let ioapic_host_tube;
2387             let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
2388                 IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
2389                 IrqChipKind::Kernel => {
2390                     ioapic_host_tube = None;
2391                     WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
2392                 }
2393                 IrqChipKind::Userspace => {
2394                     let (host_tube, ioapic_device_tube) =
2395                         Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2396                     ioapic_host_tube = Some(host_tube);
2397                     WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
2398                         components.vcpu_count,
2399                         ioapic_device_tube,
2400                     )?)
2401                 }
2402             };
2403             run_vm::<GvmVcpu, GvmVm>(
2404                 cfg,
2405                 components,
2406                 vm,
2407                 irq_chip.as_mut(),
2408                 ioapic_host_tube,
2409                 vm_evt_wrtube,
2410                 vm_evt_rdtube,
2411             )
2412         }
2413     }
2414 }
2415 
2416 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
2417 fn run_vm<Vcpu, V>(
2418     #[allow(unused_mut)] mut cfg: Config,
2419     #[allow(unused_mut)] mut components: VmComponents,
2420     mut vm: V,
2421     irq_chip: &mut dyn IrqChipArch,
2422     ioapic_host_tube: Option<Tube>,
2423     vm_evt_wrtube: SendTube,
2424     vm_evt_rdtube: RecvTube,
2425 ) -> Result<ExitState>
2426 where
2427     Vcpu: VcpuArch + 'static,
2428     V: VmArch + 'static,
2429 {
2430     let vm_memory_size_mb = components.memory_size / (1024 * 1024);
2431     let mut control_tubes = Vec::new();
2432     let mut irq_control_tubes = Vec::new();
2433     let mut vm_memory_control_tubes = Vec::new();
2434     // Create one control tube per disk.
2435     let mut disk_device_tubes = Vec::new();
2436     let mut disk_host_tubes = Vec::new();
2437     let disk_count = cfg.disks.len();
2438     for _ in 0..disk_count {
2439         let (disk_host_tube, disk_device_tube) =
2440             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2441         disk_host_tubes.push(disk_host_tube);
2442         disk_device_tubes.push(disk_device_tube);
2443     }
2444 
2445     if let Some(ioapic_host_tube) = ioapic_host_tube {
2446         irq_control_tubes.push(ioapic_host_tube);
2447     }
2448 
2449     // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2450     let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
2451         let (balloon_host_tube, balloon_device_tube) =
2452             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2453         (Some(balloon_host_tube), Some(balloon_device_tube))
2454     } else {
2455         (None, None)
2456     };
2457     // The balloon device also needs a tube to communicate back to the main process to
2458     // handle remapping memory dynamically.
2459     let dynamic_mapping_device_tube = if cfg.balloon {
2460         let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
2461             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2462         vm_memory_control_tubes.push(dynamic_mapping_host_tube);
2463         Some(dynamic_mapping_device_tube)
2464     } else {
2465         None
2466     };
2467 
2468     // PvClock gets a tube for handling suspend/resume requests from the main thread.
2469     let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
2470         let (host, device) =
2471             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2472         (Some(host), Some(device))
2473     } else {
2474         (None, None)
2475     };
2476 
2477     let gralloc = RutabagaGralloc::new(RutabagaGrallocBackendFlags::new())
2478         .exit_context(Exit::CreateGralloc, "failed to create gralloc")?;
2479 
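    // The system allocator hands out guest resources (MMIO ranges, IRQs, etc.) to devices, with
    // space reserved for the ramoops/pstore region if one was requested.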
2480     let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2481     let mut sys_allocator = SystemAllocator::new(
2482         Arch::get_system_allocator_config(&vm),
2483         pstore_size,
2484         &cfg.mmio_address_ranges,
2485     )
2486     .context("failed to create system allocator")?;
2487 
2488     // Allocate the ramoops region first.
2489     let ramoops_region = match &components.pstore {
2490         Some(pstore) => Some(
2491             arch::pstore::create_memory_region(
2492                 &mut vm,
2493                 sys_allocator.reserved_region().unwrap(),
2494                 pstore,
2495             )
2496             .exit_context(
2497                 Exit::Pstore,
2498                 format!("failed to allocate pstore region {:?}", &components.pstore),
2499             )?,
2500         ),
2501         None => None,
2502     };
2503 
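    // If an initial memory size was configured, start with the balloon inflated by the difference
    // (memory_size - init_memory); otherwise the initial balloon size is 0.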
2504     let init_balloon_size = components
2505         .memory_size
2506         .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
2507             m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
2508         }))
2509         .context("failed to calculate init balloon size")?;
2510 
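    // Calibrate the host TSC and determine which mitigations are needed if the host cores' TSCs
    // are out of sync.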
2511     let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
2512     let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);
2513 
2514     if tsc_state.core_grouping.size() > 1 {
2515         // Host TSCs are not in sync, log a metric about it.
2516         warn!(
2517             "Host TSCs are not in sync, applying the following mitigations: {:?}",
2518             tsc_sync_mitigations
2519         );
2520         log_descriptor(
2521             MetricEventType::TscCoresOutOfSync,
2522             // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
2523             tsc_state.core_grouping.core_grouping_bitmask() as i64,
2524         );
2525     }
2526 
2527     #[cfg(feature = "gpu")]
2528     let gpu_control_tube = cfg
2529         .gpu_vmm_config
2530         .as_mut()
2531         .and_then(|config| config.gpu_control_host_tube.take());
2532     let product_args = product::get_run_control_args(&mut cfg);
2533 
2534     // We open these files before lowering the token, as in the future a stricter policy may
2535     // prevent it.
2536     let dt_overlays = cfg
2537         .device_tree_overlay
2538         .iter()
2539         .map(|o| {
2540             Ok(DtbOverlay {
2541                 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2542                     .with_context(|| {
2543                         format!("failed to open device tree overlay {}", o.path.display())
2544                     })?,
2545             })
2546         })
2547         .collect::<Result<Vec<DtbOverlay>>>()?;
2548 
2549     // Lower the token, locking the main process down to a stricter security policy.
2550     //
2551     // WARNING:
2552     //
2553     // Windows system calls can behave in unusual ways if they happen concurrently to the token
2554     // lowering. For example, access denied can happen if Tube pairs are created in another thread
2555     // (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
2556     // not privileged resources, but can be broken due to the token changing unexpectedly.
2557     //
2558     // We explicitly lower the token here and *then* call run_control to make it clear that any
2559     // resources that require a privileged token should be created on the main thread & passed into
2560     // run_control, to follow the correct order:
2561     // - Privileged resources are created.
2562     // - Token is lowered.
2563     // - Threads are spawned & may create more non-privileged resources (without fear of the token
2564     //   changing at an undefined time).
2565     //
2566     // Recommendation: If you find your code doesn't work in run_control because of the sandbox, you
2567     // should split any resource creation to before this token lowering & pass the resources into
2568     // run_control. Don't move the token lowering somewhere else without considering multi-threaded
2569     // effects.
2570     #[cfg(feature = "sandbox")]
2571     if sandbox::is_sandbox_target() {
2572         sandbox::TargetServices::get()
2573             .exit_code_from_err("failed to create sandbox")?
2574             .expect("Could not create sandbox!")
2575             .lower_token();
2576     }
2577 
2578     let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;
2579 
2580     let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;
2581 
2582     let pci_devices = create_devices(
2583         &mut cfg,
2584         vm.get_memory(),
2585         &vm_evt_wrtube,
2586         &mut irq_control_tubes,
2587         &mut vm_memory_control_tubes,
2588         &mut control_tubes,
2589         &mut disk_device_tubes,
2590         balloon_device_tube,
2591         pvclock_device_tube,
2592         dynamic_mapping_device_tube,
2593         /* inflate_tube= */ None,
2594         init_balloon_size,
2595         tsc_state.frequency,
2596         virtio_snd_state_device_tube,
2597         virtio_snd_device_mute_tube,
2598     )?;
2599 
2600     let mut vcpu_ids = Vec::new();
2601 
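    // Hand off to the arch crate to build the VM proper: load the kernel/BIOS, wire up devices and
    // the IRQ chip, and produce the RunnableLinuxVm that run_control drives.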
2602     let windows = Arch::build_vm::<V, Vcpu>(
2603         components,
2604         &vm_evt_wrtube,
2605         &mut sys_allocator,
2606         &cfg.serial_parameters,
2607         None,
2608         (cfg.battery_config.as_ref().map(|t| t.type_), None),
2609         vm,
2610         ramoops_region,
2611         pci_devices,
2612         irq_chip,
2613         &mut vcpu_ids,
2614         cfg.dump_device_tree_blob.clone(),
2615         /* debugcon_jail= */ None,
2616         None,
2617         None,
2618         dt_overlays,
2619     )
2620     .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;
2621 
2622     #[cfg(feature = "stats")]
2623     let stats = if cfg.exit_stats {
2624         Some(Arc::new(Mutex::new(StatisticsCollector::new())))
2625     } else {
2626         None
2627     };
2628 
2629     run_control(
2630         windows,
2631         sys_allocator,
2632         control_tubes,
2633         irq_control_tubes,
2634         vm_memory_control_tubes,
2635         vm_evt_rdtube,
2636         vm_evt_wrtube,
2637         #[cfg(feature = "gpu")]
2638         gpu_control_tube,
2639         cfg.broker_shutdown_event.take(),
2640         balloon_host_tube,
2641         pvclock_host_tube,
2642         disk_host_tubes,
2643         gralloc,
2644         #[cfg(feature = "stats")]
2645         stats,
2646         cfg.service_pipe_name,
2647         vm_memory_size_mb,
2648         cfg.host_cpu_topology,
2649         tsc_sync_mitigations,
2650         cfg.force_calibrated_tsc_leaf,
2651         product_args,
2652         virtio_snd_host_mute_tube,
2653         cfg.restore_path,
2654         cfg.socket_path,
2655         cfg.force_s2idle,
2656         cfg.suspended,
2657     )
2658 }
2659 
2660 #[cfg(test)]
2661 mod tests {
2662     use tempfile::TempDir;
2663 
2664     use super::*;
2665 
2666     fn create_config(test_dir: &TempDir) -> Config {
2667         let mut config = Config::default();
2668 
2669         let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
2670         OpenOptions::new()
2671             .create(true)
2672             .write(true)
2673             .open(&dummy_kernel_path)
2674             .expect("Could not open file!");
2675         config.executable_path = Some(Executable::Kernel(dummy_kernel_path));
2676 
2677         config
2678     }
2679 
2680     #[test]
2681     #[should_panic(expected = "Did not receive a bios or kernel")]
2682     fn setup_vm_components_panics_when_no_kernel_provided() {
2683         let mut config =
2684             create_config(&TempDir::new().expect("Could not create temporary directory!"));
2685         config.executable_path = None;
2686         let _ = setup_vm_components(&config);
2687     }
2688 
2689     #[test]
2690     fn setup_vm_components_stores_memory_in_bytes() {
2691         let tempdir = TempDir::new().expect("Could not create temporary directory!");
2692         let mut config = create_config(&tempdir);
2693         config.memory = Some(1);
2694         let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
2695         assert_eq!(vm_components.memory_size, 1024 * 1024);
2696     }
2697 
2698     #[test]
2699     fn setup_vm_components_fails_when_memory_too_large() {
2700         let tempdir = TempDir::new().expect("Could not create temporary directory!");
2701         let mut config = create_config(&tempdir);
2702         // One MiB more than a u64 can hold in bytes.
2703         config.memory = Some((u64::MAX / 1024 / 1024) + 1);
2704         setup_vm_components(&config).err().expect("expected error");
2705     }
2706 }
2707