• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021, The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //! Functions for running instances of `crosvm`.
16 
17 use crate::aidl::{remove_temporary_files, Cid, GLOBAL_SERVICE, VirtualMachineCallbacks};
18 use crate::atom::{get_num_cpus, write_vm_exited_stats_sync};
19 use crate::debug_config::DebugConfig;
20 use anyhow::{anyhow, bail, Context, Error, Result};
21 use binder::ParcelFileDescriptor;
22 use command_fds::CommandFdExt;
23 use libc::{sysconf, _SC_CLK_TCK};
24 use log::{debug, error, info, warn};
25 use semver::{Version, VersionReq};
26 use nix::{fcntl::OFlag, unistd::pipe2, unistd::Uid, unistd::User};
27 use regex::{Captures, Regex};
28 use rustutils::system_properties;
29 use shared_child::SharedChild;
30 use std::borrow::Cow;
31 use std::cmp::max;
32 use std::ffi::CString;
33 use std::fmt;
34 use std::fs::{read_to_string, File};
35 use std::io::{self, Read};
36 use std::mem;
37 use std::num::{NonZeroU16, NonZeroU32};
38 use std::os::unix::io::{AsRawFd, OwnedFd};
39 use std::os::unix::process::CommandExt;
40 use std::os::unix::process::ExitStatusExt;
41 use std::path::{Path, PathBuf};
42 use std::process::{Command, ExitStatus};
43 use std::sync::{Arc, Condvar, Mutex, LazyLock};
44 use std::time::{Duration, SystemTime};
45 use std::thread::{self, JoinHandle};
46 use android_system_virtualizationcommon::aidl::android::system::virtualizationcommon::DeathReason::DeathReason;
47 use android_system_virtualizationservice::aidl::android::system::virtualizationservice::{
48     VirtualMachineAppConfig::DebugLevel::DebugLevel,
49     AudioConfig::AudioConfig as AudioConfigParcelable,
50     CpuOptions::CpuOptions,
51     CpuOptions::CpuTopology::CpuTopology,
52     DisplayConfig::DisplayConfig as DisplayConfigParcelable,
53     GpuConfig::GpuConfig as GpuConfigParcelable,
54     UsbConfig::UsbConfig as UsbConfigParcelable,
55 };
56 use android_system_virtualizationservice_internal::aidl::android::system::virtualizationservice_internal::IGlobalVmContext::IGlobalVmContext;
57 use android_system_virtualizationservice_internal::aidl::android::system::virtualizationservice_internal::IBoundDevice::IBoundDevice;
58 use binder::Strong;
59 use android_system_virtualmachineservice::aidl::android::system::virtualmachineservice::IVirtualMachineService::IVirtualMachineService;
60 use tombstoned_client::{TombstonedConnection, DebuggerdDumpType};
61 use rpcbinder::RpcServer;
62 
/// Absolute path of the `crosvm` binary.
const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";

/// Version of the platform that crosvm currently implements. The format follows SemVer. This
/// should be updated when there is a platform change in the crosvm side. Having this value here is
/// fine because virtualizationservice and crosvm are supposed to be updated together in the virt
/// APEX.
const CROSVM_PLATFORM_VERSION: &str = "1.0.0";

/// The exit status which crosvm returns when it has an error starting a VM.
const CROSVM_START_ERROR_STATUS: i32 = 1;
/// The exit status which crosvm returns when a VM requests a reboot.
const CROSVM_REBOOT_STATUS: i32 = 32;
/// The exit status which crosvm returns when it crashes due to an error.
const CROSVM_CRASH_STATUS: i32 = 33;
/// The exit status which crosvm returns when vcpu is stalled.
const CROSVM_WATCHDOG_REBOOT_STATUS: i32 = 36;
/// The size of memory (in MiB) reserved for ramdump.
const RAMDUMP_RESERVED_MIB: u32 = 17;

/// Number of milliseconds in one second.
const MILLIS_PER_SEC: i64 = 1000;

/// Name of a system property. NOTE(review): presumably it holds the path of a custom pvmfw image;
/// its use is outside the part of the file reviewed here -- confirm at the call site.
const SYSPROP_CUSTOM_PVMFW_PATH: &str = "hypervisor.pvmfw.path";

/// Serial device name for VM console input: the hypervisor console (virtio-console).
const CONSOLE_HVC0: &str = "hvc0";
/// Serial device name for VM console input: the emulated UART.
const CONSOLE_TTYS0: &str = "ttyS0";
91 
92 /// If the VM doesn't move to the Started state within this amount time, a hang-up error is
93 /// triggered.
94 static BOOT_HANGUP_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
95     if nested_virt::is_nested_virtualization().unwrap() {
96         // Nested virtualization is slow, so we need a longer timeout.
97         Duration::from_secs(300)
98     } else {
99         Duration::from_secs(30)
100     }
101 });
102 
/// Configuration for a VM to run with crosvm.
///
/// The config is checked by `validate_config` in `VmInstance::new`, stored in
/// `VmState::NotStarted`, and finally consumed by `VmState::start`, which hands it to
/// `run_virtiofs` and `run_vm`.
#[derive(Debug)]
pub struct CrosvmConfig {
    /// Copied into `VmInstance::cid` when the instance is created.
    pub cid: Cid,
    /// Copied into `VmInstance::name` when the instance is created.
    pub name: String,
    pub bootloader: Option<File>,
    pub kernel: Option<File>,
    pub initrd: Option<File>,
    pub disks: Vec<DiskFile>,
    pub shared_paths: Vec<SharedPathConfig>,
    pub params: Option<String>,
    /// Copied into `VmInstance::protected` when the instance is created.
    pub protected: bool,
    pub debug_config: DebugConfig,
    pub memory_mib: NonZeroU32,
    pub swiotlb_mib: Option<NonZeroU32>,
    pub cpus: CpuOptions,
    pub console_out_fd: Option<File>,
    pub console_in_fd: Option<File>,
    pub log_fd: Option<File>,
    pub ramdump: Option<File>,
    pub indirect_files: Vec<File>,
    pub platform_version: VersionReq,
    /// When true, `VmState::start` spawns a thread running `monitor_payload_hangup`.
    pub detect_hangup: bool,
    pub gdb_port: Option<NonZeroU16>,
    /// Cloned by `VmState::start`; the clones are held until the VM exit monitor finishes.
    pub vfio_devices: Vec<VfioDevice>,
    pub dtbo: Option<File>,
    pub device_tree_overlays: Vec<File>,
    pub display_config: Option<DisplayConfig>,
    pub input_device_options: Vec<InputDeviceOption>,
    pub hugepages: bool,
    /// Duplicated by `VmState::start` so the TAP interface can be deleted after the VM exits.
    pub tap: Option<File>,
    pub console_input_device: Option<String>,
    pub boost_uclamp: bool,
    pub gpu_config: Option<GpuConfig>,
    pub audio_config: Option<AudioConfig>,
    /// Copied into `VmInstance::balloon_enabled` when the instance is created.
    pub balloon: bool,
    pub usb_config: UsbConfig,
    pub dump_dt_fd: Option<File>,
    pub enable_hypervisor_specific_auth_method: bool,
    pub instance_id: [u8; 64],
    /// Tuples of (memfd, guest address, size).
    pub custom_memory_backing_files: Vec<(OwnedFd, u64, u64)>,
    pub start_suspended: bool,
}
147 
/// Audio settings for a VM, built from an `AudioConfigParcelable` by [`AudioConfig::new`].
#[derive(Debug)]
pub struct AudioConfig {
    /// Copied from the parcelable's `useMicrophone` field.
    pub use_microphone: bool,
    /// Copied from the parcelable's `useSpeaker` field.
    pub use_speaker: bool,
}
153 
154 impl AudioConfig {
new(raw_config: &AudioConfigParcelable) -> Self155     pub fn new(raw_config: &AudioConfigParcelable) -> Self {
156         AudioConfig { use_microphone: raw_config.useMicrophone, use_speaker: raw_config.useSpeaker }
157     }
158 }
159 
/// USB settings for a VM, built from a `UsbConfigParcelable` by [`UsbConfig::new`].
#[derive(Debug)]
pub struct UsbConfig {
    /// Copied from the parcelable's `controller` field.
    pub controller: bool,
}
164 
165 impl UsbConfig {
new(raw_config: &UsbConfigParcelable) -> Result<UsbConfig>166     pub fn new(raw_config: &UsbConfigParcelable) -> Result<UsbConfig> {
167         Ok(UsbConfig { controller: raw_config.controller })
168     }
169 }
170 
/// Display settings for a VM, built from a `DisplayConfigParcelable` by [`DisplayConfig::new`].
///
/// Every field is a `NonZeroU32`; `DisplayConfig::new` rejects values that are zero or negative.
#[derive(Debug)]
pub struct DisplayConfig {
    pub width: NonZeroU32,
    pub height: NonZeroU32,
    pub horizontal_dpi: NonZeroU32,
    pub vertical_dpi: NonZeroU32,
    pub refresh_rate: NonZeroU32,
}
179 
180 impl DisplayConfig {
new(raw_config: &DisplayConfigParcelable) -> Result<DisplayConfig>181     pub fn new(raw_config: &DisplayConfigParcelable) -> Result<DisplayConfig> {
182         let width = try_into_non_zero_u32(raw_config.width)?;
183         let height = try_into_non_zero_u32(raw_config.height)?;
184         let horizontal_dpi = try_into_non_zero_u32(raw_config.horizontalDpi)?;
185         let vertical_dpi = try_into_non_zero_u32(raw_config.verticalDpi)?;
186         let refresh_rate = try_into_non_zero_u32(raw_config.refreshRate)?;
187         Ok(DisplayConfig { width, height, horizontal_dpi, vertical_dpi, refresh_rate })
188     }
189 }
190 
/// GPU settings for a VM, built from a `GpuConfigParcelable` by [`GpuConfig::new`].
#[derive(Debug)]
pub struct GpuConfig {
    pub backend: Option<String>,
    /// The parcelable's `contextTypes` list with its `None` entries dropped.
    pub context_types: Option<Vec<String>>,
    pub pci_address: Option<String>,
    pub renderer_features: Option<String>,
    // `GpuConfig::new` always fills the `renderer_use_*` fields with `Some(..)`.
    pub renderer_use_egl: Option<bool>,
    pub renderer_use_gles: Option<bool>,
    pub renderer_use_glx: Option<bool>,
    pub renderer_use_surfaceless: Option<bool>,
    pub renderer_use_vulkan: Option<bool>,
}
203 
204 impl GpuConfig {
new(raw_config: &GpuConfigParcelable) -> Result<GpuConfig>205     pub fn new(raw_config: &GpuConfigParcelable) -> Result<GpuConfig> {
206         Ok(GpuConfig {
207             backend: raw_config.backend.clone(),
208             context_types: raw_config.contextTypes.clone().map(|context_types| {
209                 context_types.iter().filter_map(|context_type| context_type.clone()).collect()
210             }),
211             pci_address: raw_config.pciAddress.clone(),
212             renderer_features: raw_config.rendererFeatures.clone(),
213             renderer_use_egl: Some(raw_config.rendererUseEgl),
214             renderer_use_gles: Some(raw_config.rendererUseGles),
215             renderer_use_glx: Some(raw_config.rendererUseGlx),
216             renderer_use_surfaceless: Some(raw_config.rendererUseSurfaceless),
217             renderer_use_vulkan: Some(raw_config.rendererUseVulkan),
218         })
219     }
220 }
221 
try_into_non_zero_u32(value: i32) -> Result<NonZeroU32>222 fn try_into_non_zero_u32(value: i32) -> Result<NonZeroU32> {
223     let u32_value = value.try_into()?;
224     NonZeroU32::new(u32_value).ok_or(anyhow!("value should be greater than 0"))
225 }
226 
/// A disk image to pass to crosvm for a VM.
#[derive(Debug)]
pub struct DiskFile {
    /// Open file for the disk image.
    pub image: File,
    /// NOTE(review): presumably selects a read-write disk; the flag is consumed outside the part
    /// of the file reviewed here.
    pub writable: bool,
}
233 
/// Shared path between host and guest VM.
///
/// NOTE(review): the fields are consumed outside the part of the file reviewed here (`run_virtiofs`
/// receives the whole config), so their exact meaning is not documented in this block -- see that
/// function.
#[derive(Debug)]
pub struct SharedPathConfig {
    pub path: String,
    pub host_uid: i32,
    pub host_gid: i32,
    pub guest_uid: i32,
    pub guest_gid: i32,
    pub mask: i32,
    pub tag: String,
    pub socket_path: String,
    pub socket_fd: Option<File>,
    pub app_domain: bool,
}
248 
/// virtio-input device configuration from `external/crosvm/src/crosvm/config.rs`
///
/// Each variant carries an open `File` for the device; the touch variants also carry the
/// surface dimensions and an optional device name.
#[derive(Debug)]
// NOTE(review): presumably some variants are never constructed in this crate -- confirm before
// removing the allow.
#[allow(dead_code)]
pub enum InputDeviceOption {
    EvDev(File),
    SingleTouch { file: File, width: u32, height: u32, name: Option<String> },
    Keyboard(File),
    Mouse(File),
    Switches(File),
    MultiTouchTrackpad { file: File, width: u32, height: u32, name: Option<String> },
    MultiTouch { file: File, width: u32, height: u32, name: Option<String> },
}
261 
/// Binder handle to a bound device (`IBoundDevice`) that is assigned to a VM.
type VfioDevice = Strong<dyn IBoundDevice>;
263 
/// The lifecycle state which the payload in the VM has reported itself to be in.
///
/// Note that the order of enum variants is significant; only forward transitions are allowed by
/// [`VmInstance::update_payload_state`].
#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum PayloadState {
    /// Initial state of every new `VmInstance`.
    Starting,
    Started,
    Ready,
    Finished,
    /// The payload did not reach `Started` before `BOOT_HANGUP_TIMEOUT` expired. Set by
    /// `VmInstance::monitor_payload_hangup`.
    Hangup,
}
276 
/// The current state of the VM itself.
#[derive(Debug)]
pub enum VmState {
    /// The VM has not yet tried to start.
    NotStarted {
        /// The configuration needed to start the VM, if it has not yet been started.
        config: Box<CrosvmConfig>,
    },
    /// The VM has been started.
    Running {
        /// The crosvm child process.
        child: Arc<SharedChild>,
        /// The thread waiting for crosvm to finish. Taken (left as `None`) by
        /// `VmInstance::kill`, which joins it.
        monitor_vm_exit_thread: Option<JoinHandle<()>>,
    },
    /// The VM died or was killed.
    Dead,
    /// The VM failed to start. Also used as the placeholder value while `VmState::start` is in
    /// progress.
    Failed,
}
297 
/// RSS values of VM and CrosVM process itself.
///
/// NOTE(review): the unit of the values is set by `get_rss`, which is outside the part of the
/// file reviewed here.
#[derive(Copy, Clone, Debug, Default)]
pub struct Rss {
    /// RSS attributed to the VM.
    pub vm: i64,
    /// RSS attributed to the crosvm process itself.
    pub crosvm: i64,
}
304 
/// Metrics regarding the VM.
#[derive(Debug, Default)]
pub struct VmMetric {
    /// Timestamp recorded by `VmInstance::start` just before the VM start is attempted.
    pub start_timestamp: Option<SystemTime>,
    /// Most recent guest_time, updated periodically from /proc/[crosvm pid]/stat while the VM is
    /// running.
    pub cpu_guest_time: Option<i64>,
    /// Maximum RSS values, updated periodically from /proc/[crosvm pid]/smaps while the VM is
    /// running.
    pub rss: Option<Rss>,
}
316 
317 impl VmState {
318     /// Tries to start the VM, if it is in the `NotStarted` state.
319     ///
320     /// Returns an error if the VM is in the wrong state, or fails to start.
start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error>321     fn start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error> {
322         let state = mem::replace(self, VmState::Failed);
323         if let VmState::NotStarted { config } = state {
324             let config = *config;
325             let detect_hangup = config.detect_hangup;
326             let (failure_pipe_read, failure_pipe_write) = create_pipe()?;
327             let vfio_devices = config.vfio_devices.clone();
328             let tap =
329                 if let Some(tap_file) = &config.tap { Some(tap_file.try_clone()?) } else { None };
330 
331             let vhost_fs_devices = run_virtiofs(&config)?;
332 
333             // If this fails and returns an error, `self` will be left in the `Failed` state.
334             let child =
335                 Arc::new(run_vm(config, &instance.crosvm_control_socket_path, failure_pipe_write)?);
336 
337             let instance_monitor_status = instance.clone();
338             let child_monitor_status = child.clone();
339             thread::spawn(move || {
340                 instance_monitor_status.clone().monitor_vm_status(child_monitor_status);
341             });
342 
343             let child_clone = child.clone();
344             let instance_clone = instance.clone();
345             let monitor_vm_exit_thread = Some(thread::spawn(move || {
346                 instance_clone.monitor_vm_exit(
347                     child_clone,
348                     failure_pipe_read,
349                     vfio_devices,
350                     tap,
351                     vhost_fs_devices,
352                 );
353             }));
354 
355             if detect_hangup {
356                 let child_clone = child.clone();
357                 thread::spawn(move || {
358                     instance.monitor_payload_hangup(child_clone);
359                 });
360             }
361 
362             // If it started correctly, update the state.
363             *self = VmState::Running { child, monitor_vm_exit_thread };
364             Ok(())
365         } else {
366             *self = state;
367             bail!("VM already started or failed")
368         }
369     }
370 }
371 
/// Internal struct that holds the handles to globally unique resources of a VM.
#[derive(Debug)]
pub struct VmContext {
    /// Binder handle to the global VM context.
    #[allow(dead_code)] // Keeps the global context alive
    pub(crate) global_context: Strong<dyn IGlobalVmContext>,
    /// RPC server for the VM, if any. `VmInstance::kill` shuts it down.
    #[allow(dead_code)] // Keeps the server alive
    vm_server: Option<RpcServer>,
}
380 
381 impl VmContext {
382     /// Construct new VmContext.
new( global_context: Strong<dyn IGlobalVmContext>, vm_server: Option<RpcServer>, ) -> VmContext383     pub fn new(
384         global_context: Strong<dyn IGlobalVmContext>,
385         vm_server: Option<RpcServer>,
386     ) -> VmContext {
387         VmContext { global_context, vm_server }
388     }
389 }
390 
/// Information about a particular instance of a VM which may be running.
#[derive(Debug)]
pub struct VmInstance {
    /// The current state of the VM.
    pub vm_state: Mutex<VmState>,
    /// Global resources allocated for this VM.
    #[allow(dead_code)] // Keeps the context alive
    pub(crate) vm_context: VmContext,
    /// The CID assigned to the VM for vsock communication.
    pub cid: Cid,
    /// Path to the crosvm control socket (`crosvm.sock` inside `temporary_directory`).
    crosvm_control_socket_path: PathBuf,
    /// The name of the VM.
    pub name: String,
    /// Whether the VM is a protected VM.
    pub protected: bool,
    /// Directory of temporary files used by the VM while it is running.
    pub temporary_directory: PathBuf,
    /// The UID of the process which requested the VM.
    pub requester_uid: u32,
    /// The PID of the process which requested the VM. Note that this process may no longer exist
    /// and the PID may have been reused for a different process, so this should not be trusted.
    pub requester_debug_pid: i32,
    /// Callbacks to clients of the VM.
    pub callbacks: VirtualMachineCallbacks,
    /// VirtualMachineService binder object for the VM.
    // NOTE(review): presumably only held to keep the binder object alive -- confirm.
    #[allow(dead_code)]
    pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
    /// Recorded metrics of VM such as timestamp or cpu / memory usage.
    pub vm_metric: Mutex<VmMetric>,
    /// Whether virtio-balloon is enabled.
    pub balloon_enabled: bool,
    /// List of vendor tee services this VM might access.
    pub vendor_tee_services: Vec<String>,
    /// The latest lifecycle state which the payload reported itself to be in.
    payload_state: Mutex<PayloadState>,
    /// Represents the condition that payload_state was updated
    payload_state_updated: Condvar,
    /// The human readable name of requester_uid; falls back to the numeric UID when no user
    /// name can be looked up.
    requester_uid_name: String,
}
432 
433 impl fmt::Display for VmInstance {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result434     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
435         let adj = if self.protected { "Protected" } else { "Non-protected" };
436         write!(
437             f,
438             "{} virtual machine \"{}\" (owner: {}, cid: {})",
439             adj, self.name, self.requester_uid_name, self.cid
440         )
441     }
442 }
443 
444 impl VmInstance {
445     /// Validates the given config and creates a new `VmInstance` but doesn't start running it.
new( config: CrosvmConfig, temporary_directory: PathBuf, requester_uid: u32, requester_debug_pid: i32, vm_context: VmContext, vendor_tee_services: Vec<String>, ) -> Result<VmInstance, Error>446     pub fn new(
447         config: CrosvmConfig,
448         temporary_directory: PathBuf,
449         requester_uid: u32,
450         requester_debug_pid: i32,
451         vm_context: VmContext,
452         vendor_tee_services: Vec<String>,
453     ) -> Result<VmInstance, Error> {
454         validate_config(&config)?;
455         let cid = config.cid;
456         let name = config.name.clone();
457         let protected = config.protected;
458         let balloon_enabled = config.balloon;
459         let requester_uid_name = User::from_uid(Uid::from_raw(requester_uid))
460             .ok()
461             .flatten()
462             .map_or_else(|| format!("{}", requester_uid), |u| u.name);
463         let instance = VmInstance {
464             vm_state: Mutex::new(VmState::NotStarted { config: Box::new(config) }),
465             vm_context,
466             cid,
467             crosvm_control_socket_path: temporary_directory.join("crosvm.sock"),
468             name,
469             protected,
470             temporary_directory,
471             requester_uid,
472             requester_debug_pid,
473             callbacks: Default::default(),
474             vm_service: Mutex::new(None),
475             vm_metric: Mutex::new(Default::default()),
476             payload_state: Mutex::new(PayloadState::Starting),
477             payload_state_updated: Condvar::new(),
478             requester_uid_name,
479             balloon_enabled,
480             vendor_tee_services,
481         };
482         info!("{} created", &instance);
483         Ok(instance)
484     }
485 
486     /// Starts an instance of `crosvm` to manage the VM. The `crosvm` instance will be killed when
487     /// the `VmInstance` is dropped.
start(self: &Arc<Self>) -> Result<(), Error>488     pub fn start(self: &Arc<Self>) -> Result<(), Error> {
489         let mut vm_metric = self.vm_metric.lock().unwrap();
490         vm_metric.start_timestamp = Some(SystemTime::now());
491         let ret = self.vm_state.lock().unwrap().start(self.clone());
492         if ret.is_ok() {
493             info!("{} started", &self);
494         }
495         ret.with_context(|| format!("{} failed to start", &self))
496     }
497 
    /// Monitors the exit of the VM (i.e. termination of the `child` process). When that happens,
    /// handles the event by updating the state, notifying the event to clients by calling
    /// callbacks, and removing temporary files for the VM.
    ///
    /// * `child` - the crosvm process to wait for.
    /// * `failure_pipe_read` - read end of the pipe on which a failure reason may be written.
    /// * `vfio_devices` - device handles that are dropped once cleanup is done.
    /// * `tap` - TAP interface file, if any; the interface is deleted after the VM exits.
    /// * `vhost_user_devices` - vhost-user backend processes to reap after crosvm has exited.
    fn monitor_vm_exit(
        &self,
        child: Arc<SharedChild>,
        failure_pipe_read: File,
        vfio_devices: Vec<VfioDevice>,
        tap: Option<File>,
        vhost_user_devices: Vec<SharedChild>,
    ) {
        // Drain the failure pipe on a separate thread so that reading it does not delay waiting
        // for the child below.
        let failure_reason_thread = std::thread::spawn(move || {
            // Read the pipe to see if any failure reason is written
            let mut failure_reason = String::new();
            // Arbitrary max size in case of misbehaving guest.
            const MAX_SIZE: u64 = 50_000;
            match failure_pipe_read.take(MAX_SIZE).read_to_string(&mut failure_reason) {
                Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
                Ok(len) if len > 0 => error!("VM returned failure reason '{}'", &failure_reason),
                _ => (),
            };
            failure_reason
        });

        // Block until crosvm terminates.
        let result = child.wait();
        match &result {
            Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
            Ok(status) => {
                info!("crosvm({}) exited with status {}", child.id(), status);
                if let Some(exit_status_code) = status.code() {
                    if exit_status_code == CROSVM_WATCHDOG_REBOOT_STATUS {
                        info!("detected vcpu stall on crosvm");
                    }
                }
            }
        }

        // In crosvm, when vhost_user frontend is dead, vhost_user backend device will detect and
        // exit. We can safely wait() for vhost user device after waiting crosvm main
        // process.
        for device in vhost_user_devices {
            match device.wait() {
                Ok(status) => {
                    info!("Vhost user device({}) exited with status {}", device.id(), status);
                    if !status.success() {
                        if let Some(code) = status.code() {
                            // vhost_user backend device exit with error code
                            error!(
                                "vhost user device({}) exited with error code: {}",
                                device.id(),
                                code
                            );
                        } else {
                            // The spawned child process of vhost_user backend device is
                            // killed by signal
                            error!("vhost user device({}) killed by signal", device.id());
                        }
                    }
                }
                Err(e) => {
                    error!("Error waiting for vhost user device({}) to die: {}", device.id(), e);
                }
            }
        }

        // Panics if the reader thread itself panicked.
        let failure_reason = failure_reason_thread.join().expect("failure_reason_thread panic'd");

        let mut vm_state = self.vm_state.lock().unwrap();
        *vm_state = VmState::Dead;
        // Ensure that the mutex is released before calling the callbacks.
        drop(vm_state);
        info!("{} exited", &self);

        // In case of hangup, the pipe doesn't give us any information because the hangup can't be
        // detected on the VM side (otherwise, it isn't a hangup), but in the
        // monitor_payload_hangup function below which updates the payload state to Hangup.
        let failure_reason =
            if failure_reason.is_empty() && self.payload_state() == PayloadState::Hangup {
                Cow::from("HANGUP")
            } else {
                Cow::from(failure_reason)
            };

        // Ramdump handling is best-effort: a failure is logged and the cleanup continues.
        self.handle_ramdump().unwrap_or_else(|e| error!("Error handling ramdump: {}", e));

        let death_reason = death_reason(&result, &failure_reason);
        let exit_signal = exit_signal(&result);

        self.callbacks.callback_on_died(self.cid, death_reason);

        // This guard stays locked until the end of the function.
        let vm_metric = self.vm_metric.lock().unwrap();
        write_vm_exited_stats_sync(
            self.requester_uid as i32,
            &self.name,
            death_reason,
            exit_signal,
            &vm_metric,
        );

        // Delete temporary files. The folder itself is removed by VirtualizationServiceInternal.
        remove_temporary_files(&self.temporary_directory).unwrap_or_else(|e| {
            error!("Error removing temporary files from {:?}: {}", self.temporary_directory, e);
        });

        if let Some(tap_file) = tap {
            GLOBAL_SERVICE
                .deleteTapInterface(&ParcelFileDescriptor::new(OwnedFd::from(tap_file)))
                .unwrap_or_else(|e| {
                    error!("Error deleting TAP interface: {e:?}");
                });
        }

        drop(vfio_devices); // Cleanup devices.
    }
612 
613     /// Waits until payload is started, or timeout expires. When timeout occurs, kill
614     /// the VM to prevent indefinite hangup and update the payload_state accordingly.
monitor_payload_hangup(&self, child: Arc<SharedChild>)615     fn monitor_payload_hangup(&self, child: Arc<SharedChild>) {
616         debug!("Starting to monitor hangup for Microdroid({})", child.id());
617         let (state, result) = self
618             .payload_state_updated
619             .wait_timeout_while(self.payload_state.lock().unwrap(), *BOOT_HANGUP_TIMEOUT, |s| {
620                 *s < PayloadState::Started
621             })
622             .unwrap();
623         drop(state); // we are not interested in state
624         let child_still_running = child.try_wait().ok() == Some(None);
625         if result.timed_out() && child_still_running {
626             error!(
627                 "Microdroid({}) failed to start payload within {} secs timeout. Shutting down.",
628                 child.id(),
629                 BOOT_HANGUP_TIMEOUT.as_secs()
630             );
631             self.update_payload_state(PayloadState::Hangup).unwrap();
632             if let Err(e) = self.kill() {
633                 error!("Error stopping timed-out VM with CID {}: {:?}", child.id(), e);
634             }
635         }
636     }
637 
monitor_vm_status(&self, child: Arc<SharedChild>)638     fn monitor_vm_status(&self, child: Arc<SharedChild>) {
639         let pid = child.id();
640         let mut metric_countdown = 0;
641 
642         loop {
643             {
644                 // Check VM state
645                 let vm_state = &*self.vm_state.lock().unwrap();
646                 if let VmState::Dead = vm_state {
647                     break;
648                 }
649 
650                 if metric_countdown > 0 {
651                     metric_countdown -= 1;
652                 } else {
653                     metric_countdown = 10;
654                     let mut vm_metric = self.vm_metric.lock().unwrap();
655 
656                     // Get CPU Information
657                     match get_guest_time(pid) {
658                         Ok(guest_time) => vm_metric.cpu_guest_time = Some(guest_time),
659                         Err(e) => {
660                             metric_countdown = 0;
661                             warn!("Failed to get guest CPU time: {}", e);
662                         }
663                     }
664 
665                     // Get Memory Information
666                     match get_rss(pid) {
667                         Ok(rss) => {
668                             vm_metric.rss = match &vm_metric.rss {
669                                 Some(x) => Some(Rss::extract_max(x, &rss)),
670                                 None => Some(rss),
671                             }
672                         }
673                         Err(e) => {
674                             metric_countdown = 0;
675                             warn!("Failed to get guest RSS: {}", e);
676                         }
677                     }
678                 }
679             }
680 
681             thread::sleep(Duration::from_secs(1));
682         }
683     }
684 
is_vm_running(&self) -> bool685     fn is_vm_running(&self) -> bool {
686         matches!(&*self.vm_state.lock().unwrap(), VmState::Running { .. })
687     }
688 
689     /// Returns the last reported state of the VM payload.
payload_state(&self) -> PayloadState690     pub fn payload_state(&self) -> PayloadState {
691         *self.payload_state.lock().unwrap()
692     }
693 
694     /// Updates the payload state to the given value, if it is a valid state transition.
update_payload_state(&self, new_state: PayloadState) -> Result<(), Error>695     pub fn update_payload_state(&self, new_state: PayloadState) -> Result<(), Error> {
696         let mut state_locked = self.payload_state.lock().unwrap();
697         // Only allow forward transitions, e.g. from starting to started or finished, not back in
698         // the other direction.
699         if new_state > *state_locked {
700             *state_locked = new_state;
701             self.payload_state_updated.notify_all();
702             Ok(())
703         } else {
704             bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
705         }
706     }
707 
708     /// Kills the crosvm instance, if it is running.
kill(&self) -> Result<(), Error>709     pub fn kill(&self) -> Result<(), Error> {
710         let monitor_vm_exit_thread = {
711             let vm_state = &mut *self.vm_state.lock().unwrap();
712             if let VmState::Running { child, monitor_vm_exit_thread } = vm_state {
713                 let id = child.id();
714                 debug!("Killing crosvm({})", id);
715                 // TODO: Talk to crosvm to shutdown cleanly.
716                 child.kill().with_context(|| format!("Error killing crosvm({id}) instance"))?;
717                 monitor_vm_exit_thread.take()
718             } else {
719                 bail!("VM is not running")
720             }
721         };
722 
723         // Wait for monitor_vm_exit() to finish. Must release vm_state lock
724         // first, as monitor_vm_exit() takes it as well.
725         monitor_vm_exit_thread.map(JoinHandle::join);
726 
727         // Now that the VM has been killed, shut down the VirtualMachineService
728         // server to eagerly free up the server threads.
729         if let Some(vm_server) = &self.vm_context.vm_server {
730             vm_server.shutdown()?;
731         }
732 
733         Ok(())
734     }
735 
736     /// Returns current virtio-balloon size.
get_memory_balloon(&self) -> Result<u64, Error>737     pub fn get_memory_balloon(&self) -> Result<u64, Error> {
738         if !self.is_vm_running() {
739             bail!("get_memory_balloon when VM is not running");
740         }
741         if !self.balloon_enabled {
742             bail!("virtio-balloon is not enabled");
743         }
744         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
745         let mut balloon_actual = 0u64;
746         // SAFETY: Pointers are valid for the lifetime of the call. Null `stats` is valid.
747         let success = unsafe {
748             crosvm_control::crosvm_client_balloon_stats(
749                 socket_path_cstring.as_ptr(),
750                 /* stats= */ std::ptr::null_mut(),
751                 &mut balloon_actual,
752             )
753         };
754         if !success {
755             bail!("Error requesting balloon stats");
756         }
757         Ok(balloon_actual)
758     }
759 
760     /// Inflates the virtio-balloon by `num_bytes` to reclaim guest memory. Called in response to
761     /// memory-trimming notifications.
set_memory_balloon(&self, num_bytes: u64) -> Result<(), Error>762     pub fn set_memory_balloon(&self, num_bytes: u64) -> Result<(), Error> {
763         if !self.is_vm_running() {
764             bail!("set_memory_balloon when VM is not running");
765         }
766         if !self.balloon_enabled {
767             bail!("virtio-balloon is not enabled");
768         }
769         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
770         // SAFETY: Pointer is valid for the lifetime of the call.
771         let success = unsafe {
772             crosvm_control::crosvm_client_balloon_vms(socket_path_cstring.as_ptr(), num_bytes)
773         };
774         if !success {
775             bail!("Error sending balloon adjustment");
776         }
777         Ok(())
778     }
779 
780     /// Checks if ramdump has been created. If so, send it to tombstoned.
handle_ramdump(&self) -> Result<(), Error>781     fn handle_ramdump(&self) -> Result<(), Error> {
782         let ramdump_path = self.temporary_directory.join("ramdump");
783         if !ramdump_path.as_path().try_exists()? {
784             return Ok(());
785         }
786         if std::fs::metadata(&ramdump_path)?.len() > 0 {
787             Self::send_ramdump_to_tombstoned(&ramdump_path)?;
788         }
789         Ok(())
790     }
791 
send_ramdump_to_tombstoned(ramdump_path: &Path) -> Result<(), Error>792     fn send_ramdump_to_tombstoned(ramdump_path: &Path) -> Result<(), Error> {
793         let mut input = File::open(ramdump_path)
794             .context(format!("Failed to open ramdump {:?} for reading", ramdump_path))?;
795 
796         let pid = std::process::id() as i32;
797         let conn = TombstonedConnection::connect(pid, DebuggerdDumpType::Tombstone)
798             .context("Failed to connect to tombstoned")?;
799         let mut output = conn
800             .text_output
801             .as_ref()
802             .ok_or_else(|| anyhow!("Could not get file to write the tombstones on"))?;
803 
804         std::io::copy(&mut input, &mut output).context("Failed to send ramdump to tombstoned")?;
805         info!("Ramdump {:?} sent to tombstoned", ramdump_path);
806 
807         conn.notify_completion()?;
808         Ok(())
809     }
810 
811     /// Suspends the VM's vCPUs.
suspend(&self) -> Result<(), Error>812     pub fn suspend(&self) -> Result<(), Error> {
813         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
814         // SAFETY: Pointer is valid for the lifetime of the call.
815         let success =
816             unsafe { crosvm_control::crosvm_client_suspend_vm(socket_path_cstring.as_ptr()) };
817         if !success {
818             bail!("Failed to suspend VM");
819         }
820         Ok(())
821     }
822 
823     /// Resumes the VM's vCPUs.
resume(&self) -> Result<(), Error>824     pub fn resume(&self) -> Result<(), Error> {
825         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
826         // SAFETY: Pointer is valid for the lifetime of the call.
827         let success =
828             unsafe { crosvm_control::crosvm_client_resume_vm(socket_path_cstring.as_ptr()) };
829         if !success {
830             bail!("Failed to resume VM");
831         }
832         Ok(())
833     }
834 
835     /// Performs full resume of VM.
resume_full(&self) -> Result<(), Error>836     pub fn resume_full(&self) -> Result<(), Error> {
837         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
838         // SAFETY: Pointer is valid for the lifetime of the call.
839         let success =
840             unsafe { crosvm_control::crosvm_client_resume_vm_full(socket_path_cstring.as_ptr()) };
841         if !success {
842             bail!("Failed to resume VM");
843         }
844         Ok(())
845     }
846 }
847 
848 impl Rss {
extract_max(x: &Rss, y: &Rss) -> Rss849     fn extract_max(x: &Rss, y: &Rss) -> Rss {
850         Rss { vm: max(x.vm, y.vm), crosvm: max(x.crosvm, y.crosvm) }
851     }
852 }
853 
854 // Get Cpus_allowed mask
check_if_all_cpus_allowed() -> Result<bool>855 fn check_if_all_cpus_allowed() -> Result<bool> {
856     let file = read_to_string("/proc/self/status")?;
857     let lines: Vec<_> = file.split('\n').collect();
858 
859     for line in lines {
860         if line.contains("Cpus_allowed_list") {
861             let prop: Vec<_> = line.split_whitespace().collect();
862             if prop.len() != 2 {
863                 return Ok(false);
864             }
865             let cpu_list: Vec<_> = prop[1].split('-').collect();
866             //Only contiguous Cpu list allowed
867             if cpu_list.len() != 2 {
868                 return Ok(false);
869             }
870             if let Some(cpus) = get_num_cpus() {
871                 let max_cpu = cpu_list[1].parse::<usize>()?;
872                 if max_cpu == cpus - 1 {
873                     return Ok(true);
874                 } else {
875                     return Ok(false);
876                 }
877             }
878         }
879     }
880     Ok(false)
881 }
882 
883 // Get guest time from /proc/[crosvm pid]/stat
get_guest_time(pid: u32) -> Result<i64>884 fn get_guest_time(pid: u32) -> Result<i64> {
885     let file = read_to_string(format!("/proc/{}/stat", pid))?;
886     let data_list: Vec<_> = file.split_whitespace().collect();
887 
888     // Information about guest_time is at 43th place of the file split with the whitespace.
889     // Example of /proc/[pid]/stat :
890     // 6603 (kworker/104:1H-kblockd) I 2 0 0 0 -1 69238880 0 0 0 0 0 88 0 0 0 -20 1 0 1845 0 0
891     // 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 104 0 0 0 0 0 0 0 0 0 0 0 0 0
892     if data_list.len() < 43 {
893         bail!("Failed to parse command result for getting guest time : {}", file);
894     }
895 
896     let guest_time_ticks = data_list[42].parse::<i64>()?;
897     if guest_time_ticks == 0 {
898         bail!("zero value is measured on elapsed CPU guest_time");
899     }
900     // SAFETY: It just returns an integer about CPU tick information.
901     let ticks_per_sec = unsafe { sysconf(_SC_CLK_TCK) };
902     Ok(guest_time_ticks * MILLIS_PER_SEC / ticks_per_sec)
903 }
904 
905 // Get rss from /proc/[crosvm pid]/smaps
get_rss(pid: u32) -> Result<Rss>906 fn get_rss(pid: u32) -> Result<Rss> {
907     let file = read_to_string(format!("/proc/{}/smaps", pid))?;
908     let lines: Vec<_> = file.split('\n').collect();
909 
910     let mut rss_vm_total = 0i64;
911     let mut rss_crosvm_total = 0i64;
912     let mut is_vm = false;
913     for line in lines {
914         if line.contains("crosvm_guest") {
915             is_vm = true;
916         } else if line.contains("Rss:") {
917             let data_list: Vec<_> = line.split_whitespace().collect();
918             if data_list.len() < 2 {
919                 bail!("Failed to parse command result for getting rss :\n{}", line);
920             }
921             let rss = data_list[1].parse::<i64>()?;
922 
923             if is_vm {
924                 rss_vm_total += rss;
925                 is_vm = false;
926             }
927             rss_crosvm_total += rss;
928         }
929     }
930     if rss_crosvm_total == 0 {
931         bail!("zero value is measured on RSS of crosvm");
932     }
933     if rss_vm_total == 0 {
934         bail!("zero value is measured on RSS of VM");
935     }
936     Ok(Rss { vm: rss_vm_total, crosvm: rss_crosvm_total })
937 }
938 
death_reason(result: &Result<ExitStatus, io::Error>, mut failure_reason: &str) -> DeathReason939 fn death_reason(result: &Result<ExitStatus, io::Error>, mut failure_reason: &str) -> DeathReason {
940     if let Some((reason, info)) = failure_reason.split_once('|') {
941         // Separator indicates extra context information is present after the failure name.
942         error!("Failure info: {info}");
943         failure_reason = reason;
944     }
945     if let Ok(status) = result {
946         match failure_reason {
947             "PVM_FIRMWARE_PUBLIC_KEY_MISMATCH" => {
948                 return DeathReason::PVM_FIRMWARE_PUBLIC_KEY_MISMATCH
949             }
950             "PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED" => {
951                 return DeathReason::PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED
952             }
953             "MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE" => {
954                 return DeathReason::MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE
955             }
956             "MICRODROID_PAYLOAD_HAS_CHANGED" => return DeathReason::MICRODROID_PAYLOAD_HAS_CHANGED,
957             "MICRODROID_PAYLOAD_VERIFICATION_FAILED" => {
958                 return DeathReason::MICRODROID_PAYLOAD_VERIFICATION_FAILED
959             }
960             "MICRODROID_INVALID_PAYLOAD_CONFIG" => {
961                 return DeathReason::MICRODROID_INVALID_PAYLOAD_CONFIG
962             }
963             "MICRODROID_UNKNOWN_RUNTIME_ERROR" => {
964                 return DeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR
965             }
966             "HANGUP" => return DeathReason::HANGUP,
967             _ => {}
968         }
969         match status.code() {
970             None => DeathReason::KILLED,
971             Some(0) => DeathReason::SHUTDOWN,
972             Some(CROSVM_START_ERROR_STATUS) => DeathReason::START_FAILED,
973             Some(CROSVM_REBOOT_STATUS) => DeathReason::REBOOT,
974             Some(CROSVM_CRASH_STATUS) => DeathReason::CRASH,
975             Some(CROSVM_WATCHDOG_REBOOT_STATUS) => DeathReason::WATCHDOG_REBOOT,
976             Some(_) => DeathReason::UNKNOWN,
977         }
978     } else {
979         DeathReason::INFRASTRUCTURE_ERROR
980     }
981 }
982 
/// Returns the signal that terminated the process, if `result` holds an exit status and that
/// status reports a terminating signal. Returns `None` otherwise, including when `result` is an
/// error.
fn exit_signal(result: &Result<ExitStatus, io::Error>) -> Option<i32> {
    result.as_ref().ok().and_then(|status| status.signal())
}
989 
990 const SYSFS_PLATFORM_DEVICES_PATH: &str = "/sys/devices/platform/";
991 const VFIO_PLATFORM_DRIVER_PATH: &str = "/sys/bus/platform/drivers/vfio-platform";
992 
vfio_argument_for_platform_device(device: &VfioDevice) -> Result<String, Error>993 fn vfio_argument_for_platform_device(device: &VfioDevice) -> Result<String, Error> {
994     // Check platform device exists
995     let path = Path::new(&device.getSysfsPath()?).canonicalize()?;
996     if !path.starts_with(SYSFS_PLATFORM_DEVICES_PATH) {
997         bail!("{path:?} is not a platform device");
998     }
999 
1000     // Check platform device is bound to VFIO driver
1001     let dev_driver_path = path.join("driver").canonicalize()?;
1002     if dev_driver_path != Path::new(VFIO_PLATFORM_DRIVER_PATH) {
1003         bail!("{path:?} is not bound to VFIO-platform driver");
1004     }
1005 
1006     if let Some(p) = path.to_str() {
1007         Ok(format!("--vfio={p},iommu=pkvm-iommu,dt-symbol={0}", device.getDtboLabel()?))
1008     } else {
1009         bail!("invalid path {path:?}");
1010     }
1011 }
1012 
run_virtiofs(config: &CrosvmConfig) -> io::Result<Vec<SharedChild>>1013 fn run_virtiofs(config: &CrosvmConfig) -> io::Result<Vec<SharedChild>> {
1014     let mut devices: Vec<SharedChild> = Vec::new();
1015     for shared_path in &config.shared_paths {
1016         if shared_path.app_domain {
1017             continue;
1018         }
1019         let ugid_map_value = format!(
1020             "{} {} {} {} {} /",
1021             shared_path.guest_uid,
1022             shared_path.guest_gid,
1023             shared_path.host_uid,
1024             shared_path.host_gid,
1025             shared_path.mask,
1026         );
1027 
1028         let cfg_arg = format!("ugid_map='{}'", ugid_map_value);
1029 
1030         let mut command = Command::new(CROSVM_PATH);
1031         command
1032             .arg("device")
1033             .arg("fs")
1034             .arg(format!("--socket={}", &shared_path.socket_path))
1035             .arg(format!("--tag={}", &shared_path.tag))
1036             .arg(format!("--shared-dir={}", &shared_path.path))
1037             .arg("--cfg")
1038             .arg(cfg_arg.as_str())
1039             .arg("--disable-sandbox")
1040             .arg("--skip-pivot-root=true");
1041 
1042         print_crosvm_args(&command);
1043 
1044         let result = SharedChild::spawn(&mut command)?;
1045         info!("Spawned virtiofs crosvm({})", result.id());
1046         devices.push(result);
1047     }
1048 
1049     Ok(devices)
1050 }
1051 
1052 /// Starts an instance of `crosvm` to manage a new VM.
run_vm( config: CrosvmConfig, crosvm_control_socket_path: &Path, failure_pipe_write: File, ) -> Result<SharedChild, Error>1053 fn run_vm(
1054     config: CrosvmConfig,
1055     crosvm_control_socket_path: &Path,
1056     failure_pipe_write: File,
1057 ) -> Result<SharedChild, Error> {
1058     validate_config(&config)?;
1059 
1060     let mut command = Command::new(CROSVM_PATH);
1061 
1062     let vm_name = "crosvm_".to_owned() + &config.name;
1063     command.arg0(vm_name.clone());
1064     // TODO(qwandor): Remove --disable-sandbox.
1065     command
1066         .arg("--extended-status")
1067         // Configure the logger for the crosvm process to silence logs from the disk crate which
1068         // don't provide much information to us (but do spamming us).
1069         .arg("--log-level")
1070         .arg("info,disk=warn")
1071         .arg("run")
1072         .arg("--name")
1073         .arg(vm_name)
1074         .arg("--disable-sandbox")
1075         .arg("--cid")
1076         .arg(config.cid.to_string());
1077 
1078     if config.balloon {
1079         command.arg("--balloon-page-reporting");
1080     } else {
1081         command.arg("--no-balloon");
1082     }
1083 
1084     if !config.usb_config.controller {
1085         command.arg("--no-usb");
1086     }
1087 
1088     let mut memory_mib = config.memory_mib;
1089 
1090     if config.enable_hypervisor_specific_auth_method && !config.protected {
1091         bail!("hypervisor specific auth method only supported for protected VMs");
1092     }
1093     if config.protected {
1094         if config.enable_hypervisor_specific_auth_method {
1095             if !hypervisor_props::is_gunyah()? {
1096                 bail!("hypervisor specific auth method not supported for current hypervisor");
1097             }
1098             // "QCOM Trusted VM" compatibility mode.
1099             //
1100             // When this mode is enabled, two hypervisor specific IDs are expected to be packed
1101             // into the instance ID. We extract them here and pass along to crosvm so they can be
1102             // given to the hypervisor driver via an ioctl.
1103             let pas_id = u32::from_le_bytes(config.instance_id[60..64].try_into().unwrap());
1104             let vm_id = u16::from_le_bytes(config.instance_id[58..60].try_into().unwrap());
1105             command.arg("--hypervisor").arg(
1106                 format!("gunyah[device=/dev/gunyah,qcom_trusted_vm_id={vm_id},qcom_trusted_vm_pas_id={pas_id}]"),
1107             );
1108             // Put the FDT close to the payload (default is end of RAM) to so that CMA can be used
1109             // without bloating memory usage.
1110             command.arg("--fdt-position").arg("after-payload");
1111         }
1112 
1113         match system_properties::read(SYSPROP_CUSTOM_PVMFW_PATH)? {
1114             Some(pvmfw_path) if !pvmfw_path.is_empty() => {
1115                 if pvmfw_path == "none" {
1116                     command.arg("--protected-vm-without-firmware")
1117                 } else {
1118                     command.arg("--protected-vm-with-firmware").arg(pvmfw_path)
1119                 }
1120             }
1121             _ => command.arg("--protected-vm"),
1122         };
1123 
1124         let swiotlb_size_mib = config.swiotlb_mib.map(u32::from).unwrap_or({
1125             // 3 virtio-console devices + vsock = 4.
1126             // TODO: Count more device types, like balloon, input, and sound.
1127             let virtio_pci_device_count = 4 + config.disks.len();
1128             // crosvm virtio queue has 256 entries, so 2 MiB per device (2 pages per entry) should
1129             // be enough.
1130             // NOTE: The above explanation isn't completely accurate, e.g., circa 2024q4, each
1131             // virtio-block has 16 queues with 256 entries each and each virito-console has 2
1132             // queues of 256 entries each. So, it is allocating less than 2 pages per entry, but
1133             // seems to work well enough in practice.
1134             2 * virtio_pci_device_count as u32
1135         });
1136         command.arg("--swiotlb").arg(swiotlb_size_mib.to_string());
1137 
1138         // b/346770542 for consistent "usable" memory across protected and non-protected VMs.
1139         memory_mib = memory_mib.saturating_add(swiotlb_size_mib);
1140 
1141         // Workaround to keep crash_dump from trying to read protected guest memory.
1142         // Context in b/238324526.
1143         command.arg("--unmap-guest-memory-on-fork");
1144 
1145         if config.ramdump.is_some() {
1146             // Protected VM needs to reserve memory for ramdump here. Note that we reserve more
1147             // memory for the restricted dma pool.
1148             let ramdump_reserve = RAMDUMP_RESERVED_MIB + swiotlb_size_mib;
1149             command.arg("--params").arg(format!("crashkernel={ramdump_reserve}M"));
1150         }
1151     } else if config.ramdump.is_some() {
1152         command.arg("--params").arg(format!("crashkernel={RAMDUMP_RESERVED_MIB}M"));
1153     }
1154     if config.debug_config.debug_level == DebugLevel::NONE
1155         && config.debug_config.should_prepare_console_output()
1156     {
1157         // bootconfig.normal will be used, but we need log.
1158         command.arg("--params").arg("printk.devkmsg=on");
1159         command.arg("--params").arg("console=hvc0");
1160     }
1161 
1162     // Move the PCI MMIO regions to near the end of the low-MMIO space.
1163     // This is done to accommodate a limitation in a partner's hypervisor.
1164     #[cfg(target_arch = "aarch64")]
1165     command
1166         .arg("--pci")
1167         .arg("mem=[start=0x2c000000,size=0x2000000],cam=[start=0x2e000000,size=0x1000000]");
1168 
1169     command.arg("--mem").arg(memory_mib.to_string());
1170 
1171     fn cpu_arg_command(command: &mut Command, count: usize) {
1172         #[cfg(target_arch = "aarch64")]
1173         command.arg("--cpus").arg(count.to_string() + ",sve=[auto=true]");
1174         #[cfg(not(target_arch = "aarch64"))]
1175         command.arg("--cpus").arg(count.to_string());
1176     }
1177     match config.cpus.cpuTopology {
1178         CpuTopology::MatchHost(_) => {
1179             if cfg!(virt_cpufreq) && check_if_all_cpus_allowed()? {
1180                 command.arg("--host-cpu-topology");
1181                 #[cfg(target_arch = "aarch64")]
1182                 {
1183                     command.arg("--virt-cpufreq");
1184                     command.arg("--cpus").arg("sve=[auto=true]");
1185                 }
1186             } else {
1187                 cpu_arg_command(
1188                     &mut command,
1189                     get_num_cpus()
1190                         .context("Could not determine the number of CPUs in the system")?,
1191                 )
1192             }
1193         }
1194         CpuTopology::CpuCount(count) => {
1195             cpu_arg_command(&mut command, count.try_into().context("invalid cpu count")?)
1196         }
1197     }
1198 
1199     if let Some(gdb_port) = config.gdb_port {
1200         command.arg("--gdb").arg(gdb_port.to_string());
1201     }
1202 
1203     // Keep track of what file descriptors should be mapped to the crosvm process.
1204     let mut preserved_fds = config.indirect_files.into_iter().map(|f| f.into()).collect();
1205 
1206     if let Some(dump_dt_fd) = config.dump_dt_fd {
1207         let dump_dt_fd = add_preserved_fd(&mut preserved_fds, dump_dt_fd);
1208         command.arg("--dump-device-tree-blob").arg(dump_dt_fd);
1209     }
1210 
1211     // Setup the serial devices.
1212     // 1. uart device: used as the output device by bootloaders and as early console by linux
1213     // 2. uart device: used to report the reason for the VM failing.
1214     // 3. virtio-console device: used as the console device where kmsg is redirected to
1215     // 4. virtio-console device: used as the ramdump output
1216     // 5. virtio-console device: used as the logcat output
1217     //
1218     // When [console|log]_fd is not specified, the devices are attached to sink, which means what's
1219     // written there is discarded.
1220     let console_out_arg = format_serial_out_arg(&mut preserved_fds, config.console_out_fd);
1221     let console_in_arg = config
1222         .console_in_fd
1223         .map(|fd| format!(",input={}", add_preserved_fd(&mut preserved_fds, fd)))
1224         .unwrap_or_default();
1225     let log_arg = format_serial_out_arg(&mut preserved_fds, config.log_fd);
1226     let failure_serial_path = add_preserved_fd(&mut preserved_fds, failure_pipe_write);
1227     let ramdump_arg = format_serial_out_arg(&mut preserved_fds, config.ramdump);
1228     let console_input_device = config.console_input_device.as_deref().unwrap_or(CONSOLE_HVC0);
1229     match console_input_device {
1230         CONSOLE_HVC0 | CONSOLE_TTYS0 => {}
1231         _ => bail!("Unsupported serial device {console_input_device}"),
1232     };
1233 
1234     // Warning: Adding more serial devices requires you to shift the PCI device ID of the boot
1235     // disks in bootconfig.x86_64. This is because x86 crosvm puts serial devices and the block
1236     // devices in the same PCI bus and serial devices comes before the block devices. Arm crosvm
1237     // doesn't have the issue.
1238     // /dev/ttyS0
1239     command.arg(format!(
1240         "--serial={}{},hardware=serial,num=1",
1241         &console_out_arg,
1242         if console_input_device == CONSOLE_TTYS0 { &console_in_arg } else { "" }
1243     ));
1244     // /dev/ttyS1
1245     command.arg(format!("--serial=type=file,path={},hardware=serial,num=2", &failure_serial_path));
1246     // /dev/hvc0
1247     command.arg(format!(
1248         "--serial={}{},hardware=virtio-console,num=1",
1249         &console_out_arg,
1250         if console_input_device == CONSOLE_HVC0 { &console_in_arg } else { "" }
1251     ));
1252     // /dev/hvc1
1253     command.arg(format!("--serial={},hardware=virtio-console,num=2", &ramdump_arg));
1254     // /dev/hvc2
1255     command.arg(format!("--serial={},hardware=virtio-console,num=3", &log_arg));
1256 
1257     if let Some(bootloader) = config.bootloader {
1258         command.arg("--bios").arg(add_preserved_fd(&mut preserved_fds, bootloader));
1259     }
1260 
1261     if let Some(initrd) = config.initrd {
1262         command.arg("--initrd").arg(add_preserved_fd(&mut preserved_fds, initrd));
1263     }
1264 
1265     if let Some(params) = &config.params {
1266         command.arg("--params").arg(params);
1267     }
1268 
1269     for disk in config.disks {
1270         // Disk file locking is disabled because of missing SELinux policies.
1271         command.arg("--block").arg(format!(
1272             "path={},ro={},lock=false",
1273             add_preserved_fd(&mut preserved_fds, disk.image),
1274             !disk.writable,
1275         ));
1276     }
1277 
1278     if let Some(kernel) = config.kernel {
1279         command.arg(add_preserved_fd(&mut preserved_fds, kernel));
1280     }
1281 
1282     #[cfg(target_arch = "aarch64")]
1283     command.arg("--no-pmu");
1284 
1285     let control_sock = create_crosvm_control_listener(crosvm_control_socket_path)
1286         .context("failed to create control listener")?;
1287     command.arg("--socket").arg(add_preserved_fd(&mut preserved_fds, control_sock));
1288 
1289     config.device_tree_overlays.into_iter().for_each(|dt_overlay| {
1290         let arg = add_preserved_fd(&mut preserved_fds, dt_overlay);
1291         command.arg("--device-tree-overlay").arg(arg);
1292     });
1293 
1294     if cfg!(paravirtualized_devices) {
1295         if let Some(gpu_config) = &config.gpu_config {
1296             let mut gpu_args = Vec::new();
1297             if let Some(backend) = &gpu_config.backend {
1298                 gpu_args.push(format!("backend={}", backend));
1299             }
1300             if let Some(context_types) = &gpu_config.context_types {
1301                 gpu_args.push(format!("context-types={}", context_types.join(":")));
1302             }
1303             if let Some(pci_address) = &gpu_config.pci_address {
1304                 gpu_args.push(format!("pci-address={}", pci_address));
1305             }
1306             if let Some(renderer_features) = &gpu_config.renderer_features {
1307                 gpu_args.push(format!("renderer-features={}", renderer_features));
1308             }
1309             if gpu_config.renderer_use_egl.unwrap_or(false) {
1310                 gpu_args.push("egl=true".to_string());
1311             }
1312             if gpu_config.renderer_use_gles.unwrap_or(false) {
1313                 gpu_args.push("gles=true".to_string());
1314             }
1315             if gpu_config.renderer_use_glx.unwrap_or(false) {
1316                 gpu_args.push("glx=true".to_string());
1317             }
1318             if gpu_config.renderer_use_surfaceless.unwrap_or(false) {
1319                 gpu_args.push("surfaceless=true".to_string());
1320             }
1321             if gpu_config.renderer_use_vulkan.unwrap_or(false) {
1322                 gpu_args.push("vulkan=true".to_string());
1323             }
1324             command.arg(format!("--gpu={}", gpu_args.join(",")));
1325         }
1326         if let Some(display_config) = &config.display_config {
1327             command
1328                 .arg(format!(
1329                     "--gpu-display=mode=windowed[{},{}],dpi=[{},{}],refresh-rate={}",
1330                     display_config.width,
1331                     display_config.height,
1332                     display_config.horizontal_dpi,
1333                     display_config.vertical_dpi,
1334                     display_config.refresh_rate
1335                 ))
1336                 .arg(format!("--android-display-service={}", config.name));
1337         }
1338     }
1339 
1340     if cfg!(network) {
1341         if let Some(tap) = config.tap {
1342             add_preserved_fd(&mut preserved_fds, tap);
1343             let tap_fd = preserved_fds.last().unwrap().as_raw_fd();
1344             command.arg("--net").arg(format!("tap-fd={tap_fd}"));
1345         }
1346     }
1347 
1348     if cfg!(paravirtualized_devices) {
1349         for input_device_option in config.input_device_options.into_iter() {
1350             command.arg("--input");
1351             command.arg(match input_device_option {
1352                 InputDeviceOption::EvDev(file) => {
1353                     format!("evdev[path={}]", add_preserved_fd(&mut preserved_fds, file))
1354                 }
1355                 InputDeviceOption::Keyboard(file) => {
1356                     format!("keyboard[path={}]", add_preserved_fd(&mut preserved_fds, file))
1357                 }
1358                 InputDeviceOption::Mouse(file) => {
1359                     format!("mouse[path={}]", add_preserved_fd(&mut preserved_fds, file))
1360                 }
1361                 InputDeviceOption::SingleTouch { file, width, height, name } => format!(
1362                     "single-touch[path={},width={},height={}{}]",
1363                     add_preserved_fd(&mut preserved_fds, file),
1364                     width,
1365                     height,
1366                     name.as_ref().map_or("".into(), |n| format!(",name={}", n))
1367                 ),
1368                 InputDeviceOption::Switches(file) => {
1369                     format!("switches[path={}]", add_preserved_fd(&mut preserved_fds, file))
1370                 }
1371                 InputDeviceOption::MultiTouchTrackpad { file, width, height, name } => format!(
1372                     "multi-touch-trackpad[path={},width={},height={}{}]",
1373                     add_preserved_fd(&mut preserved_fds, file),
1374                     width,
1375                     height,
1376                     name.as_ref().map_or("".into(), |n| format!(",name={}", n))
1377                 ),
1378                 InputDeviceOption::MultiTouch { file, width, height, name } => format!(
1379                     "multi-touch[path={},width={},height={}{}]",
1380                     add_preserved_fd(&mut preserved_fds, file),
1381                     width,
1382                     height,
1383                     name.as_ref().map_or("".into(), |n| format!(",name={}", n))
1384                 ),
1385             });
1386         }
1387     }
1388 
1389     if config.hugepages {
1390         command.arg("--hugepages");
1391     }
1392 
1393     if config.boost_uclamp {
1394         command.arg("--boost-uclamp");
1395     }
1396 
1397     if !config.vfio_devices.is_empty() {
1398         if let Some(dtbo) = config.dtbo {
1399             command.arg(format!(
1400                 "--device-tree-overlay={},filter",
1401                 add_preserved_fd(&mut preserved_fds, dtbo)
1402             ));
1403         } else {
1404             bail!("VFIO devices assigned but no DTBO available");
1405         }
1406     };
1407     for device in config.vfio_devices {
1408         command.arg(vfio_argument_for_platform_device(&device)?);
1409     }
1410 
1411     for shared_path in &config.shared_paths {
1412         if shared_path.app_domain {
1413             if let Some(socket_fd) = &shared_path.socket_fd {
1414                 let socket_path =
1415                     add_preserved_fd(&mut preserved_fds, socket_fd.try_clone().unwrap());
1416                 let raw_fd: i32 = socket_path.rsplit_once('/').unwrap().1.parse().unwrap();
1417                 command
1418                     .arg("--vhost-user-fs")
1419                     .arg(format!("tag={},socket-fd={}", &shared_path.tag, raw_fd));
1420             }
1421         } else {
1422             if let Err(e) = wait_for_file(&shared_path.socket_path, 5) {
1423                 bail!("Error waiting for file: {}", e);
1424             }
1425             command
1426                 .arg("--vhost-user-fs")
1427                 .arg(format!("{},tag={}", &shared_path.socket_path, &shared_path.tag));
1428         }
1429     }
1430 
1431     for (fd, addr, size) in config.custom_memory_backing_files {
1432         command.arg("--file-backed-mapping").arg(format!(
1433             "{},addr={addr:#0x},size={size:#0x},rw,ram",
1434             add_preserved_fd(&mut preserved_fds, fd)
1435         ));
1436     }
1437 
1438     debug!("Preserving FDs {:?}", preserved_fds);
1439     command.preserved_fds(preserved_fds);
1440 
1441     if cfg!(paravirtualized_devices) {
1442         if let Some(audio_config) = &config.audio_config {
1443             command.arg("--virtio-snd").arg(format!(
1444                 "backend=aaudio,num_input_devices={},num_output_devices={}",
1445                 if audio_config.use_microphone { 1 } else { 0 },
1446                 if audio_config.use_speaker { 1 } else { 0 }
1447             ));
1448         }
1449     }
1450 
1451     if config.start_suspended {
1452         command.arg("--suspended");
1453     }
1454 
1455     print_crosvm_args(&command);
1456 
1457     let result = SharedChild::spawn(&mut command)?;
1458     debug!("Spawned crosvm({}).", result.id());
1459     Ok(result)
1460 }
1461 
/// Blocks until `path` exists or `timeout_secs` seconds have elapsed.
///
/// The path is probed with a metadata lookup roughly every 100 ms. The probe always runs at
/// least once (so a zero timeout still detects a file that is already present) and once more
/// when the deadline is reached, so a file created during the final sleep is not missed.
///
/// # Errors
///
/// Returns an error of kind [`std::io::ErrorKind::NotFound`] if the path still cannot be
/// stat'ed once the timeout has expired.
fn wait_for_file(path: &str, timeout_secs: u64) -> Result<(), std::io::Error> {
    const POLL_INTERVAL: Duration = Duration::from_millis(100);

    let start_time = std::time::Instant::now();
    let timeout = Duration::from_secs(timeout_secs);

    loop {
        // Probe first, then look at the clock: this guarantees a check at t=0 and at the deadline.
        if std::fs::metadata(path).is_ok() {
            return Ok(()); // File exists
        }
        let remaining = timeout.saturating_sub(start_time.elapsed());
        if remaining.is_zero() {
            break;
        }
        // Never sleep past the deadline.
        thread::sleep(remaining.min(POLL_INTERVAL));
    }

    Err(std::io::Error::new(
        std::io::ErrorKind::NotFound,
        format!("File not found within {} seconds: {}", timeout_secs, path),
    ))
}
1478 
1479 /// Ensure that the configuration has a valid combination of fields set, or return an error if not.
validate_config(config: &CrosvmConfig) -> Result<(), Error>1480 fn validate_config(config: &CrosvmConfig) -> Result<(), Error> {
1481     if config.bootloader.is_none() && config.kernel.is_none() {
1482         bail!("VM must have either a bootloader or a kernel image.");
1483     }
1484     if config.bootloader.is_some() && (config.kernel.is_some() || config.initrd.is_some()) {
1485         bail!("Can't have both bootloader and kernel/initrd image.");
1486     }
1487     let version = Version::parse(CROSVM_PLATFORM_VERSION).unwrap();
1488     if !config.platform_version.matches(&version) {
1489         bail!(
1490             "Incompatible platform version. The config is compatible with platform version(s) \
1491               {}, but the actual platform version is {}",
1492             config.platform_version,
1493             version
1494         );
1495     }
1496 
1497     Ok(())
1498 }
1499 
1500 /// Print arguments of the crosvm command. In doing so, /proc/self/fd/XX is annotated with the
1501 /// actual file path if the FD is backed by a regular file. If not, the /proc path is printed
1502 /// unmodified.
print_crosvm_args(command: &Command)1503 fn print_crosvm_args(command: &Command) {
1504     let re = Regex::new(r"/proc/self/fd/[\d]+").unwrap();
1505     info!(
1506         "Running crosvm with args: {:?}",
1507         command
1508             .get_args()
1509             .map(|s| s.to_string_lossy())
1510             .map(|s| {
1511                 re.replace_all(&s, |caps: &Captures| {
1512                     let path = &caps[0];
1513                     if let Ok(realpath) = std::fs::canonicalize(path) {
1514                         format!("{} ({})", path, realpath.to_string_lossy())
1515                     } else {
1516                         path.to_owned()
1517                     }
1518                 })
1519                 .into_owned()
1520             })
1521             .collect::<Vec<_>>()
1522     );
1523 }
1524 
/// Takes ownership of `file`'s descriptor by appending it to `preserved_fds`, and returns the
/// path "/proc/self/fd/N", where N is the raw descriptor number.
fn add_preserved_fd<F: Into<OwnedFd>>(preserved_fds: &mut Vec<OwnedFd>, file: F) -> String {
    let owned: OwnedFd = file.into();
    // Build the path before the descriptor is moved into the vector.
    let proc_path = format!("/proc/self/fd/{}", owned.as_raw_fd());
    preserved_fds.push(owned);
    proc_path
}
1533 
1534 /// Adds the file descriptor for `file` (if any) to `preserved_fds`, and returns the appropriate
1535 /// string for a crosvm `--serial` flag. If `file` is none, creates a dummy sink device.
format_serial_out_arg(preserved_fds: &mut Vec<OwnedFd>, file: Option<File>) -> String1536 fn format_serial_out_arg(preserved_fds: &mut Vec<OwnedFd>, file: Option<File>) -> String {
1537     if let Some(file) = file {
1538         format!("type=file,path={}", add_preserved_fd(preserved_fds, file))
1539     } else {
1540         "type=sink".to_string()
1541     }
1542 }
1543 
1544 /// Creates a new pipe with the `O_CLOEXEC` flag set, and returns the read side and write side.
create_pipe() -> Result<(File, File), Error>1545 fn create_pipe() -> Result<(File, File), Error> {
1546     let (read_fd, write_fd) = pipe2(OFlag::O_CLOEXEC)?;
1547     Ok((read_fd.into(), write_fd.into()))
1548 }
1549 
1550 /// Creates and binds a unix seqpacket listening socket to be passed as crosvm's `--socket`
1551 /// argument. See `UnixSeqpacketListener::bind` in crosvm's code for reference.
create_crosvm_control_listener(crosvm_control_socket_path: &Path) -> Result<OwnedFd>1552 fn create_crosvm_control_listener(crosvm_control_socket_path: &Path) -> Result<OwnedFd> {
1553     use nix::sys::socket;
1554     let fd = socket::socket(
1555         socket::AddressFamily::Unix,
1556         socket::SockType::SeqPacket,
1557         socket::SockFlag::empty(),
1558         None,
1559     )
1560     .context("socket failed")?;
1561     socket::bind(fd.as_raw_fd(), &socket::UnixAddr::new(crosvm_control_socket_path)?)
1562         .context("bind failed")?;
1563     // The exact backlog size isn't imporant. crosvm uses 128 internally. We use 127 here
1564     // because of a `nix` bug.
1565     socket::listen(&fd, socket::Backlog::new(127).unwrap()).context("listen failed")?;
1566     Ok(fd)
1567 }
1568 
/// Converts `path` into a NUL-terminated [`CString`].
///
/// # Panics
///
/// Panics if the path is not valid UTF-8 or contains an interior NUL byte, which should never
/// happen.
fn path_to_cstring(path: &Path) -> CString {
    match path.to_str().map(CString::new) {
        Some(Ok(c_path)) => c_path,
        // Either the UTF-8 conversion or the NUL check failed.
        _ => panic!("bad path: {path:?}"),
    }
}
1578