1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::cmp::{max, min, Reverse};
6 use std::convert::TryFrom;
7 #[cfg(feature = "gpu")]
8 use std::env;
9 use std::error::Error as StdError;
10 use std::ffi::CStr;
11 use std::fmt::{self, Display};
12 use std::fs::{File, OpenOptions};
13 use std::io::{self, stdin, Read};
14 use std::iter;
15 use std::mem;
16 use std::net::Ipv4Addr;
17 #[cfg(feature = "gpu")]
18 use std::num::NonZeroU8;
19 use std::num::ParseIntError;
20 use std::os::unix::io::FromRawFd;
21 use std::os::unix::net::UnixStream;
22 use std::path::{Path, PathBuf};
23 use std::ptr;
24 use std::str;
25 use std::sync::{mpsc, Arc, Barrier};
26
27 use std::thread;
28 use std::thread::JoinHandle;
29 use std::time::Duration;
30
31 use libc::{self, c_int, gid_t, uid_t};
32
33 use acpi_tables::sdt::SDT;
34
35 use base::net::{UnixSeqpacketListener, UnlinkUnixSeqpacketListener};
36 use base::*;
37 use devices::virtio::vhost::user::{
38 Block as VhostUserBlock, Error as VhostUserError, Fs as VhostUserFs, Net as VhostUserNet,
39 };
40 #[cfg(feature = "gpu")]
41 use devices::virtio::EventDevice;
42 use devices::virtio::{self, Console, VirtioDevice};
43 #[cfg(feature = "audio")]
44 use devices::Ac97Dev;
45 use devices::{
46 self, HostBackendDeviceProvider, IrqChip, IrqEventIndex, KvmKernelIrqChip, PciDevice,
47 VcpuRunState, VfioContainer, VfioDevice, VfioPciDevice, VirtioPciDevice, XhciController,
48 };
49 use hypervisor::kvm::{Kvm, KvmVcpu, KvmVm};
50 use hypervisor::{HypervisorCap, Vcpu, VcpuExit, VcpuRunHandle, Vm, VmCap};
51 use minijail::{self, Minijail};
52 use net_util::{Error as NetError, MacAddress, Tap};
53 use remain::sorted;
54 use resources::{Alloc, MmioType, SystemAllocator};
55 use rutabaga_gfx::RutabagaGralloc;
56 use sync::Mutex;
57 use vm_control::*;
58 use vm_memory::{GuestAddress, GuestMemory, MemoryPolicy};
59
60 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
61 use crate::gdb::{gdb_thread, GdbStub};
62 use crate::{
63 Config, DiskOption, Executable, SharedDir, SharedDirKind, TouchDeviceOption, VhostUserFsOption,
64 VhostUserOption,
65 };
66 use arch::{
67 self, LinuxArch, RunnableLinuxVm, SerialHardware, SerialParameters, VcpuAffinity,
68 VirtioDeviceStub, VmComponents, VmImage,
69 };
70
71 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
72 use {
73 aarch64::AArch64 as Arch,
74 devices::IrqChipAArch64 as IrqChipArch,
75 hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch},
76 };
77 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
78 use {
79 devices::{IrqChipX86_64 as IrqChipArch, KvmSplitIrqChip},
80 hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch},
81 x86_64::X8664arch as Arch,
82 };
83
/// Errors that the functions in this file can return.
///
/// The user-facing message for each variant is produced by the `Display`
/// implementation for this type. Variants gated behind `#[cfg(...)]` only
/// exist when the corresponding feature/target is enabled.
///
/// `#[sorted]` (from the `remain` crate) makes the build fail if the variants
/// are not kept in sorted order, so add new variants at their sorted position.
#[sorted]
#[derive(Debug)]
pub enum Error {
    AddGpuDeviceMemory(base::Error),
    AddIrqChipVcpu(base::Error),
    AddPmemDeviceMemory(base::Error),
    AllocateGpuDeviceAddress,
    AllocatePmemDeviceAddress(resources::Error),
    BalloonActualTooLarge,
    BalloonDeviceNew(virtio::BalloonError),
    BlockDeviceNew(base::Error),
    BlockSignal(base::signal::Error),
    BuildVm(<Arch as LinuxArch>::Error),
    ChownTpmStorage(base::Error),
    CloneEvent(base::Error),
    CloneVcpu(base::Error),
    ConfigureVcpu(<Arch as LinuxArch>::Error),
    #[cfg(feature = "audio")]
    CreateAc97(devices::PciDeviceError),
    CreateConsole(arch::serial::Error),
    CreateControlServer(io::Error),
    CreateDiskError(disk::Error),
    CreateEvent(base::Error),
    CreateGrallocError(rutabaga_gfx::RutabagaError),
    CreateKvm(base::Error),
    CreateSignalFd(base::SignalFdError),
    CreateSocket(io::Error),
    CreateTapDevice(NetError),
    CreateTimer(base::Error),
    // Carries the storage directory path that could not be created.
    CreateTpmStorage(PathBuf, io::Error),
    CreateTube(TubeError),
    CreateUsbProvider(devices::usb::host_backend::error::Error),
    CreateVcpu(base::Error),
    CreateVfioDevice(devices::vfio::VfioError),
    CreateVm(base::Error),
    CreateWaitContext(base::Error),
    DeviceJail(minijail::Error),
    DevicePivotRoot(minijail::Error),
    #[cfg(feature = "direct")]
    DirectIo(io::Error),
    #[cfg(feature = "direct")]
    DirectIrq(devices::DirectIrqError),
    // Carries the disk image path that could not be opened.
    Disk(PathBuf, io::Error),
    DiskImageLock(base::Error),
    DropCapabilities(base::Error),
    FsDeviceNew(virtio::fs::Error),
    GetMaxOpenFiles(io::Error),
    GetSignalMask(signal::Error),
    GuestCachedMissing(),
    GuestCachedTooLarge(std::num::TryFromIntError),
    GuestFreeMissing(),
    GuestFreeTooLarge(std::num::TryFromIntError),
    GuestMemoryLayout(<Arch as LinuxArch>::Error),
    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    HandleDebugCommand(<Arch as LinuxArch>::Error),
    InputDeviceNew(virtio::InputError),
    InputEventsOpen(std::io::Error),
    InvalidFdPath,
    InvalidWaylandPath,
    // Also the target of `From<minijail::Error>`, so `?` on a bare minijail
    // error ends up here.
    IoJail(minijail::Error),
    LoadKernel(Box<dyn StdError>),
    MemoryTooLarge,
    NetDeviceNew(virtio::NetError),
    OpenAcpiTable(PathBuf, io::Error),
    OpenAndroidFstab(PathBuf, io::Error),
    OpenBios(PathBuf, io::Error),
    OpenInitrd(PathBuf, io::Error),
    OpenKernel(PathBuf, io::Error),
    OpenVinput(PathBuf, io::Error),
    P9DeviceNew(virtio::P9Error),
    ParseMaxOpenFiles(ParseIntError),
    // Carries the pivot root path that was expected to exist.
    PivotRootDoesntExist(&'static str),
    PmemDeviceImageTooBig,
    PmemDeviceNew(base::Error),
    ReadMemAvailable(io::Error),
    ReadStatm(io::Error),
    RegisterBalloon(arch::DeviceRegistrationError),
    RegisterBlock(arch::DeviceRegistrationError),
    RegisterGpu(arch::DeviceRegistrationError),
    RegisterNet(arch::DeviceRegistrationError),
    RegisterP9(arch::DeviceRegistrationError),
    RegisterRng(arch::DeviceRegistrationError),
    RegisterSignalHandler(base::Error),
    RegisterWayland(arch::DeviceRegistrationError),
    ReserveGpuMemory(base::MmapError),
    ReserveMemory(base::Error),
    ReservePmemMemory(base::MmapError),
    ResetTimer(base::Error),
    RngDeviceNew(virtio::RngError),
    RunnableVcpu(base::Error),
    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    SendDebugStatus(Box<mpsc::SendError<VcpuDebugStatusMessage>>),
    SettingGidMap(minijail::Error),
    SettingMaxOpenFiles(minijail::Error),
    SettingSignalMask(base::Error),
    SettingUidMap(minijail::Error),
    SignalFd(base::SignalFdError),
    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    SpawnGdbServer(io::Error),
    SpawnVcpu(io::Error),
    Timer(base::Error),
    ValidateRawDescriptor(base::Error),
    VhostNetDeviceNew(virtio::vhost::Error),
    VhostUserBlockDeviceNew(VhostUserError),
    VhostUserFsDeviceNew(VhostUserError),
    VhostUserNetDeviceNew(VhostUserError),
    VhostUserNetWithNetArgs,
    VhostVsockDeviceNew(virtio::vhost::Error),
    VirtioPciDev(base::Error),
    WaitContextAdd(base::Error),
    WaitContextDelete(base::Error),
    WaylandDeviceNew(base::Error),
}
197
/// Renders each `Error` variant as a human-readable message.
///
/// Variants that wrap an underlying error append it after a short
/// description; variants that carry a path print the path as well.
impl Display for Error {
    // `#[remain::check]` enables the `#[sorted]` check on the `match` below,
    // so the arms must stay in sorted order just like the enum's variants.
    #[remain::check]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Error::*;

        #[sorted]
        match self {
            AddGpuDeviceMemory(e) => write!(f, "failed to add gpu device memory: {}", e),
            AddIrqChipVcpu(e) => write!(f, "failed to add vcpu to irq chip: {}", e),
            AddPmemDeviceMemory(e) => write!(f, "failed to add pmem device memory: {}", e),
            AllocateGpuDeviceAddress => write!(f, "failed to allocate gpu device guest address"),
            AllocatePmemDeviceAddress(e) => {
                write!(f, "failed to allocate memory for pmem device: {}", e)
            }
            BalloonActualTooLarge => write!(f, "balloon actual size is too large"),
            BalloonDeviceNew(e) => write!(f, "failed to create balloon: {}", e),
            BlockDeviceNew(e) => write!(f, "failed to create block device: {}", e),
            BlockSignal(e) => write!(f, "failed to block signal: {}", e),
            BuildVm(e) => write!(f, "The architecture failed to build the vm: {}", e),
            ChownTpmStorage(e) => write!(f, "failed to chown tpm storage: {}", e),
            CloneEvent(e) => write!(f, "failed to clone event: {}", e),
            CloneVcpu(e) => write!(f, "failed to clone vcpu: {}", e),
            ConfigureVcpu(e) => write!(f, "failed to configure vcpu: {}", e),
            #[cfg(feature = "audio")]
            CreateAc97(e) => write!(f, "failed to create ac97 device: {}", e),
            CreateConsole(e) => write!(f, "failed to create console device: {}", e),
            CreateControlServer(e) => write!(f, "failed to create control server: {}", e),
            CreateDiskError(e) => write!(f, "failed to create virtual disk: {}", e),
            CreateEvent(e) => write!(f, "failed to create event: {}", e),
            CreateGrallocError(e) => write!(f, "failed to create gralloc: {}", e),
            CreateKvm(e) => write!(f, "failed to create kvm: {}", e),
            CreateSignalFd(e) => write!(f, "failed to create signalfd: {}", e),
            CreateSocket(e) => write!(f, "failed to create socket: {}", e),
            CreateTapDevice(e) => write!(f, "failed to create tap device: {}", e),
            CreateTimer(e) => write!(f, "failed to create Timer: {}", e),
            CreateTpmStorage(p, e) => {
                write!(f, "failed to create tpm storage dir {}: {}", p.display(), e)
            }
            CreateTube(e) => write!(f, "failed to create tube: {}", e),
            CreateUsbProvider(e) => write!(f, "failed to create usb provider: {}", e),
            CreateVcpu(e) => write!(f, "failed to create vcpu: {}", e),
            CreateVfioDevice(e) => write!(f, "Failed to create vfio device {}", e),
            CreateVm(e) => write!(f, "failed to create vm: {}", e),
            CreateWaitContext(e) => write!(f, "failed to create wait context: {}", e),
            DeviceJail(e) => write!(f, "failed to jail device: {}", e),
            DevicePivotRoot(e) => write!(f, "failed to pivot root device: {}", e),
            #[cfg(feature = "direct")]
            DirectIo(e) => write!(f, "failed to open direct io device: {}", e),
            #[cfg(feature = "direct")]
            DirectIrq(e) => write!(f, "failed to enable interrupt forwarding: {}", e),
            Disk(p, e) => write!(f, "failed to load disk image {}: {}", p.display(), e),
            DiskImageLock(e) => write!(f, "failed to lock disk image: {}", e),
            DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e),
            FsDeviceNew(e) => write!(f, "failed to create fs device: {}", e),
            GetMaxOpenFiles(e) => write!(f, "failed to get max number of open files: {}", e),
            GetSignalMask(e) => write!(f, "failed to retrieve signal mask for vcpu: {}", e),
            GuestCachedMissing() => write!(f, "guest cached is missing from balloon stats"),
            GuestCachedTooLarge(e) => write!(f, "guest cached is too large: {}", e),
            GuestFreeMissing() => write!(f, "guest free is missing from balloon stats"),
            GuestFreeTooLarge(e) => write!(f, "guest free is too large: {}", e),
            GuestMemoryLayout(e) => write!(f, "failed to create guest memory layout: {}", e),
            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
            HandleDebugCommand(e) => write!(f, "failed to handle a gdb command: {}", e),
            InputDeviceNew(e) => write!(f, "failed to set up input device: {}", e),
            InputEventsOpen(e) => write!(f, "failed to open event device: {}", e),
            InvalidFdPath => write!(f, "failed parsing a /proc/self/fd/*"),
            InvalidWaylandPath => write!(f, "wayland socket path has no parent or file name"),
            // Printed without any prefix: only the wrapped minijail error is shown.
            IoJail(e) => write!(f, "{}", e),
            LoadKernel(e) => write!(f, "failed to load kernel: {}", e),
            MemoryTooLarge => write!(f, "requested memory size too large"),
            NetDeviceNew(e) => write!(f, "failed to set up virtio networking: {}", e),
            OpenAcpiTable(p, e) => write!(f, "failed to open ACPI file {}: {}", p.display(), e),
            OpenAndroidFstab(p, e) => write!(
                f,
                "failed to open android fstab file {}: {}",
                p.display(),
                e
            ),
            OpenBios(p, e) => write!(f, "failed to open bios {}: {}", p.display(), e),
            OpenInitrd(p, e) => write!(f, "failed to open initrd {}: {}", p.display(), e),
            OpenKernel(p, e) => write!(f, "failed to open kernel image {}: {}", p.display(), e),
            OpenVinput(p, e) => write!(f, "failed to open vinput device {}: {}", p.display(), e),
            P9DeviceNew(e) => write!(f, "failed to create 9p device: {}", e),
            ParseMaxOpenFiles(e) => write!(f, "failed to parse max number of open files: {}", e),
            PivotRootDoesntExist(p) => write!(f, "{} doesn't exist, can't jail devices.", p),
            PmemDeviceImageTooBig => {
                write!(f, "failed to create pmem device: pmem device image too big")
            }
            PmemDeviceNew(e) => write!(f, "failed to create pmem device: {}", e),
            ReadMemAvailable(e) => write!(
                f,
                "failed to read /sys/kernel/mm/chromeos-low_mem/available: {}",
                e
            ),
            ReadStatm(e) => write!(f, "failed to read /proc/self/statm: {}", e),
            RegisterBalloon(e) => write!(f, "error registering balloon device: {}", e),
            RegisterBlock(e) => write!(f, "error registering block device: {}", e),
            RegisterGpu(e) => write!(f, "error registering gpu device: {}", e),
            RegisterNet(e) => write!(f, "error registering net device: {}", e),
            RegisterP9(e) => write!(f, "error registering 9p device: {}", e),
            RegisterRng(e) => write!(f, "error registering rng device: {}", e),
            RegisterSignalHandler(e) => write!(f, "error registering signal handler: {}", e),
            RegisterWayland(e) => write!(f, "error registering wayland device: {}", e),
            ReserveGpuMemory(e) => write!(f, "failed to reserve gpu memory: {}", e),
            ReserveMemory(e) => write!(f, "failed to reserve memory: {}", e),
            ReservePmemMemory(e) => write!(f, "failed to reserve pmem memory: {}", e),
            ResetTimer(e) => write!(f, "failed to reset Timer: {}", e),
            RngDeviceNew(e) => write!(f, "failed to set up rng: {}", e),
            RunnableVcpu(e) => write!(f, "failed to set thread id for vcpu: {}", e),
            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
            SendDebugStatus(e) => write!(f, "failed to send a debug status to GDB thread: {}", e),
            SettingGidMap(e) => write!(f, "error setting GID map: {}", e),
            SettingMaxOpenFiles(e) => write!(f, "error setting max open files: {}", e),
            SettingSignalMask(e) => write!(f, "failed to set the signal mask for vcpu: {}", e),
            SettingUidMap(e) => write!(f, "error setting UID map: {}", e),
            SignalFd(e) => write!(f, "failed to read signal fd: {}", e),
            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
            SpawnGdbServer(e) => write!(f, "failed to spawn GDB thread: {}", e),
            SpawnVcpu(e) => write!(f, "failed to spawn VCPU thread: {}", e),
            Timer(e) => write!(f, "failed to read timer fd: {}", e),
            ValidateRawDescriptor(e) => write!(f, "failed to validate raw descriptor: {}", e),
            VhostNetDeviceNew(e) => write!(f, "failed to set up vhost networking: {}", e),
            VhostUserBlockDeviceNew(e) => {
                write!(f, "failed to set up vhost-user block device: {}", e)
            }
            VhostUserFsDeviceNew(e) => write!(f, "failed to set up vhost-user fs device: {}", e),
            VhostUserNetDeviceNew(e) => write!(f, "failed to set up vhost-user net device: {}", e),
            VhostUserNetWithNetArgs => write!(
                f,
                "vhost-user-net cannot be used with any of --host_ip, --netmask or --mac"
            ),
            VhostVsockDeviceNew(e) => write!(f, "failed to set up virtual socket device: {}", e),
            VirtioPciDev(e) => write!(f, "failed to create virtio pci dev: {}", e),
            WaitContextAdd(e) => write!(f, "failed to add descriptor to wait context: {}", e),
            WaitContextDelete(e) => {
                write!(f, "failed to remove descriptor from wait context: {}", e)
            }
            WaylandDeviceNew(e) => write!(f, "failed to create wayland device: {}", e),
        }
    }
}
339
340 impl From<minijail::Error> for Error {
from(err: minijail::Error) -> Self341 fn from(err: minijail::Error) -> Self {
342 Error::IoJail(err)
343 }
344 }
345
346 impl std::error::Error for Error {}
347
/// Shorthand for a `Result` whose error type is this file's `Error`.
type Result<T> = std::result::Result<T, Error>;
349
/// A `Tube` wrapped in a tag that names its role.
///
/// Every variant holds exactly one `Tube`; the `AsRef<Tube>` impl returns it
/// regardless of the tag.
// NOTE(review): the tag presumably selects which request type is exchanged
// over the tube — confirm against the code that consumes these.
enum TaggedControlTube {
    Fs(Tube),
    Vm(Tube),
    VmMemory(Tube),
    VmIrq(Tube),
    VmMsync(Tube),
}
357
358 impl AsRef<Tube> for TaggedControlTube {
as_ref(&self) -> &Tube359 fn as_ref(&self) -> &Tube {
360 use self::TaggedControlTube::*;
361 match &self {
362 Fs(tube) | Vm(tube) | VmMemory(tube) | VmIrq(tube) | VmMsync(tube) => tube,
363 }
364 }
365 }
366
367 impl AsRawDescriptor for TaggedControlTube {
as_raw_descriptor(&self) -> RawDescriptor368 fn as_raw_descriptor(&self) -> RawDescriptor {
369 self.as_ref().as_raw_descriptor()
370 }
371 }
372
get_max_open_files() -> Result<u64>373 fn get_max_open_files() -> Result<u64> {
374 let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
375
376 // Safe because this will only modify `buf` and we check the return value.
377 let res = unsafe { libc::prlimit64(0, libc::RLIMIT_NOFILE, ptr::null(), buf.as_mut_ptr()) };
378 if res == 0 {
379 // Safe because the kernel guarantees that the struct is fully initialized.
380 let limit = unsafe { buf.assume_init() };
381 Ok(limit.rlim_max)
382 } else {
383 Err(Error::GetMaxOpenFiles(io::Error::last_os_error()))
384 }
385 }
386
/// Sandbox settings consumed by `create_base_minijail`.
struct SandboxConfig<'a> {
    // When true, the jail is configured with an empty capability set (`use_caps(0)`).
    limit_caps: bool,
    // When true, seccomp filter failures are logged and the `.policy` file is
    // used even if a precompiled `.bpf` file exists.
    log_failures: bool,
    // Seccomp policy path; its extension is replaced with `bpf` or `policy`
    // when the policy is loaded.
    seccomp_policy: &'a Path,
    // Optional uid map string passed to the jail's `uidmap`.
    uid_map: Option<&'a str>,
    // Optional gid map string passed to the jail's `gidmap`.
    gid_map: Option<&'a str>,
}
394
create_base_minijail( root: &Path, r_limit: Option<u64>, config: Option<&SandboxConfig>, ) -> Result<Minijail>395 fn create_base_minijail(
396 root: &Path,
397 r_limit: Option<u64>,
398 config: Option<&SandboxConfig>,
399 ) -> Result<Minijail> {
400 // All child jails run in a new user namespace without any users mapped,
401 // they run as nobody unless otherwise configured.
402 let mut j = Minijail::new().map_err(Error::DeviceJail)?;
403
404 if let Some(config) = config {
405 j.namespace_pids();
406 j.namespace_user();
407 j.namespace_user_disable_setgroups();
408 if config.limit_caps {
409 // Don't need any capabilities.
410 j.use_caps(0);
411 }
412 if let Some(uid_map) = config.uid_map {
413 j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
414 }
415 if let Some(gid_map) = config.gid_map {
416 j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
417 }
418 // Run in a new mount namespace.
419 j.namespace_vfs();
420
421 // Run in an empty network namespace.
422 j.namespace_net();
423
424 // Don't allow the device to gain new privileges.
425 j.no_new_privs();
426
427 // By default we'll prioritize using the pre-compiled .bpf over the .policy
428 // file (the .bpf is expected to be compiled using "trap" as the failure
429 // behavior instead of the default "kill" behavior).
430 // Refer to the code comment for the "seccomp-log-failures"
431 // command-line parameter for an explanation about why the |log_failures|
432 // flag forces the use of .policy files (and the build-time alternative to
433 // this run-time flag).
434 let bpf_policy_file = config.seccomp_policy.with_extension("bpf");
435 if bpf_policy_file.exists() && !config.log_failures {
436 j.parse_seccomp_program(&bpf_policy_file)
437 .map_err(Error::DeviceJail)?;
438 } else {
439 // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
440 // which will correctly kill the entire device process if a worker
441 // thread commits a seccomp violation.
442 j.set_seccomp_filter_tsync();
443 if config.log_failures {
444 j.log_seccomp_filter_failures();
445 }
446 j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy"))
447 .map_err(Error::DeviceJail)?;
448 }
449 j.use_seccomp_filter();
450 // Don't do init setup.
451 j.run_as_init();
452 }
453
454 // Only pivot_root if we are not re-using the current root directory.
455 if root != Path::new("/") {
456 // It's safe to call `namespace_vfs` multiple times.
457 j.namespace_vfs();
458 j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;
459 }
460
461 // Most devices don't need to open many fds.
462 let limit = if let Some(r) = r_limit { r } else { 1024u64 };
463 j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)
464 .map_err(Error::SettingMaxOpenFiles)?;
465
466 Ok(j)
467 }
468
simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>>469 fn simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>> {
470 if cfg.sandbox {
471 let pivot_root: &str = option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty");
472 // A directory for a jailed device's pivot root.
473 let root_path = Path::new(pivot_root);
474 if !root_path.exists() {
475 return Err(Error::PivotRootDoesntExist(pivot_root));
476 }
477 let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy);
478 let config = SandboxConfig {
479 limit_caps: true,
480 log_failures: cfg.seccomp_log_failures,
481 seccomp_policy: &policy_path,
482 uid_map: None,
483 gid_map: None,
484 };
485 Ok(Some(create_base_minijail(root_path, None, Some(&config))?))
486 } else {
487 Ok(None)
488 }
489 }
490
/// Result type returned by the device constructors in this file; the success
/// type defaults to `VirtioDeviceStub` and the error type is always `Error`.
type DeviceResult<T = VirtioDeviceStub> = std::result::Result<T, Error>;
492
create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult493 fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
494 // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
495 let raw_image: File = if disk.path.parent() == Some(Path::new("/proc/self/fd")) {
496 // Safe because we will validate |raw_fd|.
497 unsafe { File::from_raw_descriptor(raw_descriptor_from_path(&disk.path)?) }
498 } else {
499 OpenOptions::new()
500 .read(true)
501 .write(!disk.read_only)
502 .open(&disk.path)
503 .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?
504 };
505 // Lock the disk image to prevent other crosvm instances from using it.
506 let lock_op = if disk.read_only {
507 FlockOperation::LockShared
508 } else {
509 FlockOperation::LockExclusive
510 };
511 flock(&raw_image, lock_op, true).map_err(Error::DiskImageLock)?;
512
513 let dev = if disk::async_ok(&raw_image).map_err(Error::CreateDiskError)? {
514 let async_file = disk::create_async_disk_file(raw_image).map_err(Error::CreateDiskError)?;
515 Box::new(
516 virtio::BlockAsync::new(
517 virtio::base_features(cfg.protected_vm),
518 async_file,
519 disk.read_only,
520 disk.sparse,
521 disk.block_size,
522 disk.id,
523 Some(disk_device_tube),
524 )
525 .map_err(Error::BlockDeviceNew)?,
526 ) as Box<dyn VirtioDevice>
527 } else {
528 let disk_file = disk::create_disk_file(raw_image).map_err(Error::CreateDiskError)?;
529 Box::new(
530 virtio::Block::new(
531 virtio::base_features(cfg.protected_vm),
532 disk_file,
533 disk.read_only,
534 disk.sparse,
535 disk.block_size,
536 disk.id,
537 Some(disk_device_tube),
538 )
539 .map_err(Error::BlockDeviceNew)?,
540 ) as Box<dyn VirtioDevice>
541 };
542
543 Ok(VirtioDeviceStub {
544 dev,
545 jail: simple_jail(&cfg, "block_device")?,
546 })
547 }
548
create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult549 fn create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult {
550 let dev = VhostUserBlock::new(virtio::base_features(cfg.protected_vm), &opt.socket)
551 .map_err(Error::VhostUserBlockDeviceNew)?;
552
553 Ok(VirtioDeviceStub {
554 dev: Box::new(dev),
555 // no sandbox here because virtqueue handling is exported to a different process.
556 jail: None,
557 })
558 }
559
create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult560 fn create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult {
561 let dev = VhostUserFs::new(
562 virtio::base_features(cfg.protected_vm),
563 &option.socket,
564 &option.tag,
565 )
566 .map_err(Error::VhostUserFsDeviceNew)?;
567
568 Ok(VirtioDeviceStub {
569 dev: Box::new(dev),
570 // no sandbox here because virtqueue handling is exported to a different process.
571 jail: None,
572 })
573 }
574
create_rng_device(cfg: &Config) -> DeviceResult575 fn create_rng_device(cfg: &Config) -> DeviceResult {
576 let dev =
577 virtio::Rng::new(virtio::base_features(cfg.protected_vm)).map_err(Error::RngDeviceNew)?;
578
579 Ok(VirtioDeviceStub {
580 dev: Box::new(dev),
581 jail: simple_jail(&cfg, "rng_device")?,
582 })
583 }
584
/// Creates a virtio TPM device backed by a storage directory.
///
/// With sandboxing enabled, the jail gets a 20 KB tmpfs mounted at `/`, the
/// crosvm user is added to it, and a per-process directory
/// `/run/vm/tpm.<pid>` is created, chowned to that user and bind-mounted into
/// the jail as the storage directory. Without a jail, `/tmp/tpm-simulator` is
/// used as the storage path and nothing is created here.
#[cfg(feature = "tpm")]
fn create_tpm_device(cfg: &Config) -> DeviceResult {
    use std::ffi::CString;
    use std::fs;
    use std::process;

    let mut tpm_jail = simple_jail(cfg, "tpm_device")?;

    let tpm_storage: PathBuf = match &mut tpm_jail {
        Some(jail) => {
            // Create a tmpfs in the device's root directory for tpm
            // simulator storage. The size is 20*1024, or 20 KB.
            jail.mount_with_data(
                Path::new("none"),
                Path::new("/"),
                "tmpfs",
                (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
                "size=20480",
            )?;

            let crosvm_ids = add_crosvm_user_to_jail(jail, "tpm")?;

            let tpm_pid_dir = format!("/run/vm/tpm.{}", process::id());
            let storage = PathBuf::from(&tpm_pid_dir);
            fs::create_dir_all(&storage)
                .map_err(|e| Error::CreateTpmStorage(storage.clone(), e))?;
            let tpm_pid_dir_c = CString::new(tpm_pid_dir).expect("no nul bytes");
            chown(&tpm_pid_dir_c, crosvm_ids.uid, crosvm_ids.gid)
                .map_err(Error::ChownTpmStorage)?;

            // Expose the storage directory at the same path inside the jail.
            jail.mount_bind(&storage, &storage, true)?;
            storage
        }
        // Path used inside cros_sdk which does not have /run/vm.
        None => PathBuf::from("/tmp/tpm-simulator"),
    };

    let dev = virtio::Tpm::new(tpm_storage);

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: tpm_jail,
    })
}
632
create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOption) -> DeviceResult633 fn create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOption) -> DeviceResult {
634 let socket = single_touch_spec
635 .get_path()
636 .into_unix_stream()
637 .map_err(|e| {
638 error!("failed configuring virtio single touch: {:?}", e);
639 e
640 })?;
641
642 let (width, height) = single_touch_spec.get_size();
643 let dev = virtio::new_single_touch(
644 socket,
645 width,
646 height,
647 virtio::base_features(cfg.protected_vm),
648 )
649 .map_err(Error::InputDeviceNew)?;
650 Ok(VirtioDeviceStub {
651 dev: Box::new(dev),
652 jail: simple_jail(&cfg, "input_device")?,
653 })
654 }
655
create_multi_touch_device(cfg: &Config, multi_touch_spec: &TouchDeviceOption) -> DeviceResult656 fn create_multi_touch_device(cfg: &Config, multi_touch_spec: &TouchDeviceOption) -> DeviceResult {
657 let socket = multi_touch_spec
658 .get_path()
659 .into_unix_stream()
660 .map_err(|e| {
661 error!("failed configuring virtio multi touch: {:?}", e);
662 e
663 })?;
664
665 let (width, height) = multi_touch_spec.get_size();
666 let dev = virtio::new_multi_touch(
667 socket,
668 width,
669 height,
670 virtio::base_features(cfg.protected_vm),
671 )
672 .map_err(Error::InputDeviceNew)?;
673
674 Ok(VirtioDeviceStub {
675 dev: Box::new(dev),
676 jail: simple_jail(&cfg, "input_device")?,
677 })
678 }
679
create_trackpad_device(cfg: &Config, trackpad_spec: &TouchDeviceOption) -> DeviceResult680 fn create_trackpad_device(cfg: &Config, trackpad_spec: &TouchDeviceOption) -> DeviceResult {
681 let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| {
682 error!("failed configuring virtio trackpad: {}", e);
683 e
684 })?;
685
686 let (width, height) = trackpad_spec.get_size();
687 let dev = virtio::new_trackpad(
688 socket,
689 width,
690 height,
691 virtio::base_features(cfg.protected_vm),
692 )
693 .map_err(Error::InputDeviceNew)?;
694
695 Ok(VirtioDeviceStub {
696 dev: Box::new(dev),
697 jail: simple_jail(&cfg, "input_device")?,
698 })
699 }
700
create_mouse_device<T: IntoUnixStream>(cfg: &Config, mouse_socket: T) -> DeviceResult701 fn create_mouse_device<T: IntoUnixStream>(cfg: &Config, mouse_socket: T) -> DeviceResult {
702 let socket = mouse_socket.into_unix_stream().map_err(|e| {
703 error!("failed configuring virtio mouse: {}", e);
704 e
705 })?;
706
707 let dev = virtio::new_mouse(socket, virtio::base_features(cfg.protected_vm))
708 .map_err(Error::InputDeviceNew)?;
709
710 Ok(VirtioDeviceStub {
711 dev: Box::new(dev),
712 jail: simple_jail(&cfg, "input_device")?,
713 })
714 }
715
create_keyboard_device<T: IntoUnixStream>(cfg: &Config, keyboard_socket: T) -> DeviceResult716 fn create_keyboard_device<T: IntoUnixStream>(cfg: &Config, keyboard_socket: T) -> DeviceResult {
717 let socket = keyboard_socket.into_unix_stream().map_err(|e| {
718 error!("failed configuring virtio keyboard: {}", e);
719 e
720 })?;
721
722 let dev = virtio::new_keyboard(socket, virtio::base_features(cfg.protected_vm))
723 .map_err(Error::InputDeviceNew)?;
724
725 Ok(VirtioDeviceStub {
726 dev: Box::new(dev),
727 jail: simple_jail(&cfg, "input_device")?,
728 })
729 }
730
create_switches_device<T: IntoUnixStream>(cfg: &Config, switches_socket: T) -> DeviceResult731 fn create_switches_device<T: IntoUnixStream>(cfg: &Config, switches_socket: T) -> DeviceResult {
732 let socket = switches_socket.into_unix_stream().map_err(|e| {
733 error!("failed configuring virtio switches: {}", e);
734 e
735 })?;
736
737 let dev = virtio::new_switches(socket, virtio::base_features(cfg.protected_vm))
738 .map_err(Error::InputDeviceNew)?;
739
740 Ok(VirtioDeviceStub {
741 dev: Box::new(dev),
742 jail: simple_jail(&cfg, "input_device")?,
743 })
744 }
745
create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult746 fn create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult {
747 let dev_file = OpenOptions::new()
748 .read(true)
749 .write(true)
750 .open(dev_path)
751 .map_err(|e| Error::OpenVinput(dev_path.to_owned(), e))?;
752
753 let dev = virtio::new_evdev(dev_file, virtio::base_features(cfg.protected_vm))
754 .map_err(Error::InputDeviceNew)?;
755
756 Ok(VirtioDeviceStub {
757 dev: Box::new(dev),
758 jail: simple_jail(&cfg, "input_device")?,
759 })
760 }
761
create_balloon_device(cfg: &Config, tube: Tube) -> DeviceResult762 fn create_balloon_device(cfg: &Config, tube: Tube) -> DeviceResult {
763 let dev = virtio::Balloon::new(virtio::base_features(cfg.protected_vm), tube)
764 .map_err(Error::BalloonDeviceNew)?;
765
766 Ok(VirtioDeviceStub {
767 dev: Box::new(dev),
768 jail: simple_jail(&cfg, "balloon_device")?,
769 })
770 }
771
create_tap_net_device(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult772 fn create_tap_net_device(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult {
773 // Safe because we ensure that we get a unique handle to the fd.
774 let tap = unsafe {
775 Tap::from_raw_descriptor(
776 validate_raw_descriptor(tap_fd).map_err(Error::ValidateRawDescriptor)?,
777 )
778 .map_err(Error::CreateTapDevice)?
779 };
780
781 let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1);
782 let vcpu_count = cfg.vcpu_count.unwrap_or(1);
783 if vcpu_count < vq_pairs as usize {
784 error!("net vq pairs must be smaller than vcpu count, fall back to single queue mode");
785 vq_pairs = 1;
786 }
787 let features = virtio::base_features(cfg.protected_vm);
788 let dev = virtio::Net::from(features, tap, vq_pairs).map_err(Error::NetDeviceNew)?;
789
790 Ok(VirtioDeviceStub {
791 dev: Box::new(dev),
792 jail: simple_jail(&cfg, "net_device")?,
793 })
794 }
795
create_net_device( cfg: &Config, host_ip: Ipv4Addr, netmask: Ipv4Addr, mac_address: MacAddress, mem: &GuestMemory, ) -> DeviceResult796 fn create_net_device(
797 cfg: &Config,
798 host_ip: Ipv4Addr,
799 netmask: Ipv4Addr,
800 mac_address: MacAddress,
801 mem: &GuestMemory,
802 ) -> DeviceResult {
803 let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1);
804 let vcpu_count = cfg.vcpu_count.unwrap_or(1);
805 if vcpu_count < vq_pairs as usize {
806 error!("net vq pairs must be smaller than vcpu count, fall back to single queue mode");
807 vq_pairs = 1;
808 }
809
810 let features = virtio::base_features(cfg.protected_vm);
811 let dev = if cfg.vhost_net {
812 let dev = virtio::vhost::Net::<Tap, vhost::Net<Tap>>::new(
813 &cfg.vhost_net_device_path,
814 features,
815 host_ip,
816 netmask,
817 mac_address,
818 mem,
819 )
820 .map_err(Error::VhostNetDeviceNew)?;
821 Box::new(dev) as Box<dyn VirtioDevice>
822 } else {
823 let dev = virtio::Net::<Tap>::new(features, host_ip, netmask, mac_address, vq_pairs)
824 .map_err(Error::NetDeviceNew)?;
825 Box::new(dev) as Box<dyn VirtioDevice>
826 };
827
828 let policy = if cfg.vhost_net {
829 "vhost_net_device"
830 } else {
831 "net_device"
832 };
833
834 Ok(VirtioDeviceStub {
835 dev,
836 jail: simple_jail(&cfg, policy)?,
837 })
838 }
839
create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult840 fn create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult {
841 let dev = VhostUserNet::new(virtio::base_features(cfg.protected_vm), &opt.socket)
842 .map_err(Error::VhostUserNetDeviceNew)?;
843
844 Ok(VirtioDeviceStub {
845 dev: Box::new(dev),
846 // no sandbox here because virtqueue handling is exported to a different process.
847 jail: None,
848 })
849 }
850
/// Builds the virtio GPU device and, when sandboxing is enabled, the jail it runs in.
///
/// Display backends are tried in the order Wayland (only when `wayland_socket_path` is given),
/// X, then the stub backend. When a jail is created, the host paths the GPU stack needs are
/// mounted into it; see the comments in the body for each mount.
///
/// Panics if `cfg.gpu_parameters` is `None`.
#[cfg(feature = "gpu")]
fn create_gpu_device(
    cfg: &Config,
    exit_evt: &Event,
    gpu_device_tube: Tube,
    resource_bridges: Vec<Tube>,
    wayland_socket_path: Option<&PathBuf>,
    x_display: Option<String>,
    event_devices: Vec<EventDevice>,
    map_request: Arc<Mutex<Option<ExternalMapping>>>,
    mem: &GuestMemory,
) -> DeviceResult {
    let jailed_wayland_path = Path::new("/wayland-0");

    // Wayland, when available, takes priority over X, which takes priority over the stub.
    let mut display_backends = Vec::with_capacity(3);
    if let Some(socket_path) = wayland_socket_path {
        // Inside the sandbox the socket is visible at its bind-mounted location instead.
        let visible_path = if cfg.sandbox {
            jailed_wayland_path.to_owned()
        } else {
            socket_path.to_owned()
        };
        display_backends.push(virtio::DisplayBackend::Wayland(Some(visible_path)));
    }
    display_backends.push(virtio::DisplayBackend::X(x_display));
    display_backends.push(virtio::DisplayBackend::Stub);

    let dev = virtio::Gpu::new(
        exit_evt.try_clone().map_err(Error::CloneEvent)?,
        Some(gpu_device_tube),
        NonZeroU8::new(1).unwrap(), // number of scanouts
        resource_bridges,
        display_backends,
        cfg.gpu_parameters.as_ref().unwrap(),
        event_devices,
        map_request,
        cfg.sandbox,
        virtio::base_features(cfg.protected_vm),
        cfg.wayland_socket_paths.clone(),
        mem.clone(),
    );

    let mut jail = simple_jail(cfg, "gpu_device")?;
    if let Some(jail) = jail.as_mut() {
        // Create a tmpfs in the device's root directory so that we can bind mount the
        // dri directory into it. The size=67108864 is size=64*1024*1024 or size=64MB.
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
            "size=67108864",
        )?;

        // Device nodes required for DRM.
        for sys_dir in &["/sys/dev/char", "/sys/devices"] {
            let sys_path = Path::new(sys_dir);
            jail.mount_bind(sys_path, sys_path, false)?;
        }

        let drm_dri_path = Path::new("/dev/dri");
        if drm_dri_path.exists() {
            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
        }

        // Prepare GPU shader disk cache directory.
        if let Some(params) = cfg.gpu_parameters.as_ref() {
            if let Some(cache_dir) = params.cache_path.as_ref() {
                if cfg!(any(target_arch = "arm", target_arch = "aarch64")) && cfg.sandbox {
                    warn!("shader caching not yet supported on ARM with sandbox enabled");
                    env::set_var("MESA_GLSL_CACHE_DISABLE", "true");
                } else {
                    env::set_var("MESA_GLSL_CACHE_DISABLE", "false");
                    env::set_var("MESA_GLSL_CACHE_DIR", cache_dir);
                    if let Some(cache_size) = params.cache_size.as_ref() {
                        env::set_var("MESA_GLSL_CACHE_MAX_SIZE", cache_size);
                    }
                    let shadercache_path = Path::new(cache_dir);
                    jail.mount_bind(shadercache_path, shadercache_path, true)?;
                }
            }
        }

        // Optional host device nodes, bind mounted writable when present: the ARM specific
        // devices (mali0, pvr_sync) and the udmabuf driver.
        for dev_node in &["/dev/mali0", "/dev/pvr_sync", "/dev/udmabuf"] {
            let node_path = Path::new(dev_node);
            if node_path.exists() {
                jail.mount_bind(node_path, node_path, true)?;
            }
        }

        // Libraries that are required when mesa drivers are dynamically loaded.
        let lib_dirs = &[
            "/usr/lib",
            "/usr/lib64",
            "/lib",
            "/lib64",
            "/usr/share/vulkan",
        ];
        for lib_dir in lib_dirs {
            let lib_path = Path::new(lib_dir);
            if lib_path.exists() {
                jail.mount_bind(lib_path, lib_path, false)?;
            }
        }

        // Bind mount the wayland socket into jail's root. This is necessary since each
        // new wayland context must open() the socket. Don't bind mount the camera socket
        // since it seems to cause problems on ARCVM (b/180126126) + Mali. It's unclear if
        // camera team will opt for virtio-camera or continue using virtio-wl, so this should
        // be fine for now.
        if let Some(host_socket) = wayland_socket_path {
            jail.mount_bind(host_socket, jailed_wayland_path, true)?;
        }

        add_crosvm_user_to_jail(jail, "gpu")?;

        // pvr driver requires read access to /proc/self/task/*/comm.
        let proc_path = Path::new("/proc");
        jail.mount(
            proc_path,
            proc_path,
            "proc",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
        )?;

        // To enable perfetto tracing, we need to give access to the perfetto service IPC
        // endpoints.
        let perfetto_path = Path::new("/run/perfetto");
        if perfetto_path.exists() {
            jail.mount_bind(perfetto_path, perfetto_path, true)?;
        }
    }

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail,
    })
}
1012
create_wayland_device( cfg: &Config, control_tube: Tube, resource_bridge: Option<Tube>, ) -> DeviceResult1013 fn create_wayland_device(
1014 cfg: &Config,
1015 control_tube: Tube,
1016 resource_bridge: Option<Tube>,
1017 ) -> DeviceResult {
1018 let wayland_socket_dirs = cfg
1019 .wayland_socket_paths
1020 .iter()
1021 .map(|(_name, path)| path.parent())
1022 .collect::<Option<Vec<_>>>()
1023 .ok_or(Error::InvalidWaylandPath)?;
1024
1025 let features = virtio::base_features(cfg.protected_vm);
1026 let dev = virtio::Wl::new(
1027 features,
1028 cfg.wayland_socket_paths.clone(),
1029 control_tube,
1030 resource_bridge,
1031 )
1032 .map_err(Error::WaylandDeviceNew)?;
1033
1034 let jail = match simple_jail(&cfg, "wl_device")? {
1035 Some(mut jail) => {
1036 // Create a tmpfs in the device's root directory so that we can bind mount the wayland
1037 // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB.
1038 jail.mount_with_data(
1039 Path::new("none"),
1040 Path::new("/"),
1041 "tmpfs",
1042 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
1043 "size=67108864",
1044 )?;
1045
1046 // Bind mount the wayland socket's directory into jail's root. This is necessary since
1047 // each new wayland context must open() the socket. If the wayland socket is ever
1048 // destroyed and remade in the same host directory, new connections will be possible
1049 // without restarting the wayland device.
1050 for dir in &wayland_socket_dirs {
1051 jail.mount_bind(dir, dir, true)?;
1052 }
1053 add_crosvm_user_to_jail(&mut jail, "Wayland")?;
1054
1055 Some(jail)
1056 }
1057 None => None,
1058 };
1059
1060 Ok(VirtioDeviceStub {
1061 dev: Box::new(dev),
1062 jail,
1063 })
1064 }
1065
/// Builds a virtio video device of the given type and, when sandboxing is enabled, its jail.
///
/// The jail uses the "video_device" policy and is populated with the device nodes, libraries
/// and sockets listed in the body.
#[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
fn create_video_device(
    cfg: &Config,
    typ: devices::virtio::VideoDeviceType,
    resource_bridge: Tube,
) -> DeviceResult {
    let mut jail = simple_jail(cfg, "video_device")?;
    if let Some(jail) = jail.as_mut() {
        // The feature label is only used in the fallback warnings of add_crosvm_user_to_jail.
        let feature = match typ {
            devices::virtio::VideoDeviceType::Decoder => "video-decoder",
            devices::virtio::VideoDeviceType::Encoder => "video-encoder",
        };
        add_crosvm_user_to_jail(jail, feature)?;

        // Create a tmpfs in the device's root directory so that we can bind mount files.
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
            "size=67108864",
        )?;

        // Render node for libvda.
        let render_node = Path::new("/dev/dri/renderD128");
        jail.mount_bind(render_node, render_node, false)?;

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            // Device nodes used by libdrm through minigbm in libvda on AMD devices
            // (/sys/dev/char, /sys/devices), followed by /usr/lib64, which is required for
            // loading dri libraries loaded by minigbm on AMD devices.
            for host_dir in &["/sys/dev/char", "/sys/devices", "/usr/lib64"] {
                let host_path = Path::new(host_dir);
                jail.mount_bind(host_path, host_path, false)?;
            }
        }

        // Device nodes required by libchrome which establishes Mojo connection in libvda.
        let urandom = Path::new("/dev/urandom");
        jail.mount_bind(urandom, urandom, false)?;
        let dbus_socket = Path::new("/run/dbus/system_bus_socket");
        jail.mount_bind(dbus_socket, dbus_socket, true)?;
    }

    let video = devices::virtio::VideoDevice::new(
        virtio::base_features(cfg.protected_vm),
        typ,
        Some(resource_bridge),
    );

    Ok(VirtioDeviceStub {
        dev: Box::new(video),
        jail,
    })
}
1129
/// Creates a video device of type `typ` and appends it to `devs`.
///
/// `video_tube` is handed to the device as its resource bridge. Nothing is appended when
/// device creation fails.
#[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
fn register_video_device(
    devs: &mut Vec<VirtioDeviceStub>,
    video_tube: Tube,
    cfg: &Config,
    typ: devices::virtio::VideoDeviceType,
) -> std::result::Result<(), Error> {
    let stub = create_video_device(cfg, typ, video_tube)?;
    devs.push(stub);
    Ok(())
}
1140
create_vhost_vsock_device(cfg: &Config, cid: u64, mem: &GuestMemory) -> DeviceResult1141 fn create_vhost_vsock_device(cfg: &Config, cid: u64, mem: &GuestMemory) -> DeviceResult {
1142 let features = virtio::base_features(cfg.protected_vm);
1143 let dev = virtio::vhost::Vsock::new(&cfg.vhost_vsock_device_path, features, cid, mem)
1144 .map_err(Error::VhostVsockDeviceNew)?;
1145
1146 Ok(VirtioDeviceStub {
1147 dev: Box::new(dev),
1148 jail: simple_jail(&cfg, "vhost_vsock_device")?,
1149 })
1150 }
1151
create_fs_device( cfg: &Config, uid_map: &str, gid_map: &str, src: &Path, tag: &str, fs_cfg: virtio::fs::passthrough::Config, device_tube: Tube, ) -> DeviceResult1152 fn create_fs_device(
1153 cfg: &Config,
1154 uid_map: &str,
1155 gid_map: &str,
1156 src: &Path,
1157 tag: &str,
1158 fs_cfg: virtio::fs::passthrough::Config,
1159 device_tube: Tube,
1160 ) -> DeviceResult {
1161 let max_open_files = get_max_open_files()?;
1162 let j = if cfg.sandbox {
1163 let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
1164 let config = SandboxConfig {
1165 limit_caps: false,
1166 uid_map: Some(uid_map),
1167 gid_map: Some(gid_map),
1168 log_failures: cfg.seccomp_log_failures,
1169 seccomp_policy: &seccomp_policy,
1170 };
1171 let mut jail = create_base_minijail(src, Some(max_open_files), Some(&config))?;
1172 // We want bind mounts from the parent namespaces to propagate into the fs device's
1173 // namespace.
1174 jail.set_remount_mode(libc::MS_SLAVE);
1175
1176 jail
1177 } else {
1178 create_base_minijail(src, Some(max_open_files), None)?
1179 };
1180
1181 let features = virtio::base_features(cfg.protected_vm);
1182 // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic
1183 // when num_queues > 1.
1184 let dev =
1185 virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_tube).map_err(Error::FsDeviceNew)?;
1186
1187 Ok(VirtioDeviceStub {
1188 dev: Box::new(dev),
1189 jail: Some(j),
1190 })
1191 }
1192
create_9p_device( cfg: &Config, uid_map: &str, gid_map: &str, src: &Path, tag: &str, mut p9_cfg: p9::Config, ) -> DeviceResult1193 fn create_9p_device(
1194 cfg: &Config,
1195 uid_map: &str,
1196 gid_map: &str,
1197 src: &Path,
1198 tag: &str,
1199 mut p9_cfg: p9::Config,
1200 ) -> DeviceResult {
1201 let max_open_files = get_max_open_files()?;
1202 let (jail, root) = if cfg.sandbox {
1203 let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device");
1204 let config = SandboxConfig {
1205 limit_caps: false,
1206 uid_map: Some(uid_map),
1207 gid_map: Some(gid_map),
1208 log_failures: cfg.seccomp_log_failures,
1209 seccomp_policy: &seccomp_policy,
1210 };
1211
1212 let mut jail = create_base_minijail(src, Some(max_open_files), Some(&config))?;
1213 // We want bind mounts from the parent namespaces to propagate into the 9p server's
1214 // namespace.
1215 jail.set_remount_mode(libc::MS_SLAVE);
1216
1217 // The shared directory becomes the root of the device's file system.
1218 let root = Path::new("/");
1219 (Some(jail), root)
1220 } else {
1221 // There's no mount namespace so we tell the server to treat the source directory as the
1222 // root.
1223 (None, src)
1224 };
1225
1226 let features = virtio::base_features(cfg.protected_vm);
1227 p9_cfg.root = root.into();
1228 let dev = virtio::P9::new(features, tag, p9_cfg).map_err(Error::P9DeviceNew)?;
1229
1230 Ok(VirtioDeviceStub {
1231 dev: Box::new(dev),
1232 jail,
1233 })
1234 }
1235
create_pmem_device( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, disk: &DiskOption, index: usize, pmem_device_tube: Tube, ) -> DeviceResult1236 fn create_pmem_device(
1237 cfg: &Config,
1238 vm: &mut impl Vm,
1239 resources: &mut SystemAllocator,
1240 disk: &DiskOption,
1241 index: usize,
1242 pmem_device_tube: Tube,
1243 ) -> DeviceResult {
1244 // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
1245 let fd: File = if disk.path.parent() == Some(Path::new("/proc/self/fd")) {
1246 // Safe because we will validate |raw_fd|.
1247 unsafe { File::from_raw_descriptor(raw_descriptor_from_path(&disk.path)?) }
1248 } else {
1249 OpenOptions::new()
1250 .read(true)
1251 .write(!disk.read_only)
1252 .open(&disk.path)
1253 .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?
1254 };
1255
1256 let arena_size = {
1257 let metadata =
1258 std::fs::metadata(&disk.path).map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?;
1259 let disk_len = metadata.len();
1260 // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page
1261 // at the end of an mmap'd file and won't write back beyond the actual file length, but if
1262 // we just align the size of the file to 2 MiB then access beyond the last page of the
1263 // mapped file will generate SIGBUS. So use a memory mapping arena that will provide
1264 // padding up to 2 MiB.
1265 let alignment = 2 * 1024 * 1024;
1266 let align_adjust = if disk_len % alignment != 0 {
1267 alignment - (disk_len % alignment)
1268 } else {
1269 0
1270 };
1271 disk_len
1272 .checked_add(align_adjust)
1273 .ok_or(Error::PmemDeviceImageTooBig)?
1274 };
1275
1276 let protection = {
1277 if disk.read_only {
1278 Protection::read()
1279 } else {
1280 Protection::read_write()
1281 }
1282 };
1283
1284 let arena = {
1285 // Conversion from u64 to usize may fail on 32bit system.
1286 let arena_size = usize::try_from(arena_size).map_err(|_| Error::PmemDeviceImageTooBig)?;
1287
1288 let mut arena = MemoryMappingArena::new(arena_size).map_err(Error::ReservePmemMemory)?;
1289 arena
1290 .add_fd_offset_protection(0, arena_size, &fd, 0, protection)
1291 .map_err(Error::ReservePmemMemory)?;
1292 arena
1293 };
1294
1295 let mapping_address = resources
1296 .mmio_allocator(MmioType::High)
1297 .allocate_with_align(
1298 arena_size,
1299 Alloc::PmemDevice(index),
1300 format!("pmem_disk_image_{}", index),
1301 // Linux kernel requires pmem namespaces to be 128 MiB aligned.
1302 128 * 1024 * 1024, /* 128 MiB */
1303 )
1304 .map_err(Error::AllocatePmemDeviceAddress)?;
1305
1306 let slot = vm
1307 .add_memory_region(
1308 GuestAddress(mapping_address),
1309 Box::new(arena),
1310 /* read_only = */ disk.read_only,
1311 /* log_dirty_pages = */ false,
1312 )
1313 .map_err(Error::AddPmemDeviceMemory)?;
1314
1315 let dev = virtio::Pmem::new(
1316 virtio::base_features(cfg.protected_vm),
1317 fd,
1318 GuestAddress(mapping_address),
1319 slot,
1320 arena_size,
1321 Some(pmem_device_tube),
1322 )
1323 .map_err(Error::PmemDeviceNew)?;
1324
1325 Ok(VirtioDeviceStub {
1326 dev: Box::new(dev) as Box<dyn VirtioDevice>,
1327 jail: simple_jail(&cfg, "pmem_device")?,
1328 })
1329 }
1330
create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult1331 fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
1332 let mut keep_rds = Vec::new();
1333 let evt = Event::new().map_err(Error::CreateEvent)?;
1334 let dev = param
1335 .create_serial_device::<Console>(cfg.protected_vm, &evt, &mut keep_rds)
1336 .map_err(Error::CreateConsole)?;
1337
1338 let jail = match simple_jail(&cfg, "serial")? {
1339 Some(mut jail) => {
1340 // Create a tmpfs in the device's root directory so that we can bind mount the
1341 // log socket directory into it.
1342 // The size=67108864 is size=64*1024*1024 or size=64MB.
1343 jail.mount_with_data(
1344 Path::new("none"),
1345 Path::new("/"),
1346 "tmpfs",
1347 (libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID) as usize,
1348 "size=67108864",
1349 )?;
1350 add_crosvm_user_to_jail(&mut jail, "serial")?;
1351 let res = param.add_bind_mounts(&mut jail);
1352 if res.is_err() {
1353 error!("failed to add bind mounts for console device");
1354 }
1355 Some(jail)
1356 }
1357 None => None,
1358 };
1359
1360 Ok(VirtioDeviceStub {
1361 dev: Box::new(dev),
1362 jail, // TODO(dverkamp): use a separate policy for console?
1363 })
1364 }
1365
1366 // gpu_device_tube is not used when GPU support is disabled.
1367 #[cfg_attr(not(feature = "gpu"), allow(unused_variables))]
create_virtio_devices( cfg: &Config, mem: &GuestMemory, vm: &mut impl Vm, resources: &mut SystemAllocator, _exit_evt: &Event, wayland_device_tube: Tube, gpu_device_tube: Tube, balloon_device_tube: Tube, disk_device_tubes: &mut Vec<Tube>, pmem_device_tubes: &mut Vec<Tube>, map_request: Arc<Mutex<Option<ExternalMapping>>>, fs_device_tubes: &mut Vec<Tube>, ) -> DeviceResult<Vec<VirtioDeviceStub>>1368 fn create_virtio_devices(
1369 cfg: &Config,
1370 mem: &GuestMemory,
1371 vm: &mut impl Vm,
1372 resources: &mut SystemAllocator,
1373 _exit_evt: &Event,
1374 wayland_device_tube: Tube,
1375 gpu_device_tube: Tube,
1376 balloon_device_tube: Tube,
1377 disk_device_tubes: &mut Vec<Tube>,
1378 pmem_device_tubes: &mut Vec<Tube>,
1379 map_request: Arc<Mutex<Option<ExternalMapping>>>,
1380 fs_device_tubes: &mut Vec<Tube>,
1381 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
1382 let mut devs = Vec::new();
1383
1384 for (_, param) in cfg
1385 .serial_parameters
1386 .iter()
1387 .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
1388 {
1389 let dev = create_console_device(cfg, param)?;
1390 devs.push(dev);
1391 }
1392
1393 for disk in &cfg.disks {
1394 let disk_device_tube = disk_device_tubes.remove(0);
1395 devs.push(create_block_device(cfg, disk, disk_device_tube)?);
1396 }
1397
1398 for blk in &cfg.vhost_user_blk {
1399 devs.push(create_vhost_user_block_device(cfg, blk)?);
1400 }
1401
1402 for (index, pmem_disk) in cfg.pmem_devices.iter().enumerate() {
1403 let pmem_device_tube = pmem_device_tubes.remove(0);
1404 devs.push(create_pmem_device(
1405 cfg,
1406 vm,
1407 resources,
1408 pmem_disk,
1409 index,
1410 pmem_device_tube,
1411 )?);
1412 }
1413
1414 devs.push(create_rng_device(cfg)?);
1415
1416 #[cfg(feature = "tpm")]
1417 {
1418 if cfg.software_tpm {
1419 devs.push(create_tpm_device(cfg)?);
1420 }
1421 }
1422
1423 if let Some(single_touch_spec) = &cfg.virtio_single_touch {
1424 devs.push(create_single_touch_device(cfg, single_touch_spec)?);
1425 }
1426
1427 if let Some(multi_touch_spec) = &cfg.virtio_multi_touch {
1428 devs.push(create_multi_touch_device(cfg, multi_touch_spec)?);
1429 }
1430
1431 if let Some(trackpad_spec) = &cfg.virtio_trackpad {
1432 devs.push(create_trackpad_device(cfg, trackpad_spec)?);
1433 }
1434
1435 if let Some(mouse_socket) = &cfg.virtio_mouse {
1436 devs.push(create_mouse_device(cfg, mouse_socket)?);
1437 }
1438
1439 if let Some(keyboard_socket) = &cfg.virtio_keyboard {
1440 devs.push(create_keyboard_device(cfg, keyboard_socket)?);
1441 }
1442
1443 if let Some(switches_socket) = &cfg.virtio_switches {
1444 devs.push(create_switches_device(cfg, switches_socket)?);
1445 }
1446
1447 for dev_path in &cfg.virtio_input_evdevs {
1448 devs.push(create_vinput_device(cfg, dev_path)?);
1449 }
1450
1451 devs.push(create_balloon_device(cfg, balloon_device_tube)?);
1452
1453 // We checked above that if the IP is defined, then the netmask is, too.
1454 for tap_fd in &cfg.tap_fd {
1455 devs.push(create_tap_net_device(cfg, *tap_fd)?);
1456 }
1457
1458 if let (Some(host_ip), Some(netmask), Some(mac_address)) =
1459 (cfg.host_ip, cfg.netmask, cfg.mac_address)
1460 {
1461 if !cfg.vhost_user_net.is_empty() {
1462 return Err(Error::VhostUserNetWithNetArgs);
1463 }
1464 devs.push(create_net_device(cfg, host_ip, netmask, mac_address, mem)?);
1465 }
1466
1467 for net in &cfg.vhost_user_net {
1468 devs.push(create_vhost_user_net_device(cfg, net)?);
1469 }
1470
1471 #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
1472 let mut resource_bridges = Vec::<Tube>::new();
1473
1474 if !cfg.wayland_socket_paths.is_empty() {
1475 #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
1476 let mut wl_resource_bridge = None::<Tube>;
1477
1478 #[cfg(feature = "gpu")]
1479 {
1480 if cfg.gpu_parameters.is_some() {
1481 let (wl_socket, gpu_socket) = Tube::pair().map_err(Error::CreateTube)?;
1482 resource_bridges.push(gpu_socket);
1483 wl_resource_bridge = Some(wl_socket);
1484 }
1485 }
1486
1487 devs.push(create_wayland_device(
1488 cfg,
1489 wayland_device_tube,
1490 wl_resource_bridge,
1491 )?);
1492 }
1493
1494 #[cfg(feature = "video-decoder")]
1495 let video_dec_tube = if cfg.video_dec {
1496 let (video_tube, gpu_tube) = Tube::pair().map_err(Error::CreateTube)?;
1497 resource_bridges.push(gpu_tube);
1498 Some(video_tube)
1499 } else {
1500 None
1501 };
1502
1503 #[cfg(feature = "video-encoder")]
1504 let video_enc_tube = if cfg.video_enc {
1505 let (video_tube, gpu_tube) = Tube::pair().map_err(Error::CreateTube)?;
1506 resource_bridges.push(gpu_tube);
1507 Some(video_tube)
1508 } else {
1509 None
1510 };
1511
1512 #[cfg(feature = "gpu")]
1513 {
1514 if let Some(gpu_parameters) = &cfg.gpu_parameters {
1515 let mut event_devices = Vec::new();
1516 if cfg.display_window_mouse {
1517 let (event_device_socket, virtio_dev_socket) =
1518 UnixStream::pair().map_err(Error::CreateSocket)?;
1519 let (multi_touch_width, multi_touch_height) = cfg
1520 .virtio_multi_touch
1521 .as_ref()
1522 .map(|multi_touch_spec| multi_touch_spec.get_size())
1523 .unwrap_or((gpu_parameters.display_width, gpu_parameters.display_height));
1524 let dev = virtio::new_multi_touch(
1525 virtio_dev_socket,
1526 multi_touch_width,
1527 multi_touch_height,
1528 virtio::base_features(cfg.protected_vm),
1529 )
1530 .map_err(Error::InputDeviceNew)?;
1531 devs.push(VirtioDeviceStub {
1532 dev: Box::new(dev),
1533 jail: simple_jail(&cfg, "input_device")?,
1534 });
1535 event_devices.push(EventDevice::touchscreen(event_device_socket));
1536 }
1537 if cfg.display_window_keyboard {
1538 let (event_device_socket, virtio_dev_socket) =
1539 UnixStream::pair().map_err(Error::CreateSocket)?;
1540 let dev = virtio::new_keyboard(
1541 virtio_dev_socket,
1542 virtio::base_features(cfg.protected_vm),
1543 )
1544 .map_err(Error::InputDeviceNew)?;
1545 devs.push(VirtioDeviceStub {
1546 dev: Box::new(dev),
1547 jail: simple_jail(&cfg, "input_device")?,
1548 });
1549 event_devices.push(EventDevice::keyboard(event_device_socket));
1550 }
1551 devs.push(create_gpu_device(
1552 cfg,
1553 _exit_evt,
1554 gpu_device_tube,
1555 resource_bridges,
1556 // Use the unnamed socket for GPU display screens.
1557 cfg.wayland_socket_paths.get(""),
1558 cfg.x_display.clone(),
1559 event_devices,
1560 map_request,
1561 mem,
1562 )?);
1563 }
1564 }
1565
1566 #[cfg(feature = "video-decoder")]
1567 {
1568 if let Some(video_dec_tube) = video_dec_tube {
1569 register_video_device(
1570 &mut devs,
1571 video_dec_tube,
1572 cfg,
1573 devices::virtio::VideoDeviceType::Decoder,
1574 )?;
1575 }
1576 }
1577
1578 #[cfg(feature = "video-encoder")]
1579 {
1580 if let Some(video_enc_tube) = video_enc_tube {
1581 register_video_device(
1582 &mut devs,
1583 video_enc_tube,
1584 cfg,
1585 devices::virtio::VideoDeviceType::Encoder,
1586 )?;
1587 }
1588 }
1589
1590 if let Some(cid) = cfg.cid {
1591 devs.push(create_vhost_vsock_device(cfg, cid, mem)?);
1592 }
1593
1594 for vhost_user_fs in &cfg.vhost_user_fs {
1595 devs.push(create_vhost_user_fs_device(cfg, &vhost_user_fs)?);
1596 }
1597
1598 for shared_dir in &cfg.shared_dirs {
1599 let SharedDir {
1600 src,
1601 tag,
1602 kind,
1603 uid_map,
1604 gid_map,
1605 fs_cfg,
1606 p9_cfg,
1607 } = shared_dir;
1608
1609 let dev = match kind {
1610 SharedDirKind::FS => {
1611 let device_tube = fs_device_tubes.remove(0);
1612 create_fs_device(cfg, uid_map, gid_map, src, tag, fs_cfg.clone(), device_tube)?
1613 }
1614 SharedDirKind::P9 => create_9p_device(cfg, uid_map, gid_map, src, tag, p9_cfg.clone())?,
1615 };
1616 devs.push(dev);
1617 }
1618
1619 Ok(devs)
1620 }
1621
create_devices( cfg: &Config, mem: &GuestMemory, vm: &mut impl Vm, resources: &mut SystemAllocator, exit_evt: &Event, control_tubes: &mut Vec<TaggedControlTube>, wayland_device_tube: Tube, gpu_device_tube: Tube, balloon_device_tube: Tube, disk_device_tubes: &mut Vec<Tube>, pmem_device_tubes: &mut Vec<Tube>, fs_device_tubes: &mut Vec<Tube>, usb_provider: HostBackendDeviceProvider, map_request: Arc<Mutex<Option<ExternalMapping>>>, ) -> DeviceResult<Vec<(Box<dyn PciDevice>, Option<Minijail>)>>1622 fn create_devices(
1623 cfg: &Config,
1624 mem: &GuestMemory,
1625 vm: &mut impl Vm,
1626 resources: &mut SystemAllocator,
1627 exit_evt: &Event,
1628 control_tubes: &mut Vec<TaggedControlTube>,
1629 wayland_device_tube: Tube,
1630 gpu_device_tube: Tube,
1631 balloon_device_tube: Tube,
1632 disk_device_tubes: &mut Vec<Tube>,
1633 pmem_device_tubes: &mut Vec<Tube>,
1634 fs_device_tubes: &mut Vec<Tube>,
1635 usb_provider: HostBackendDeviceProvider,
1636 map_request: Arc<Mutex<Option<ExternalMapping>>>,
1637 ) -> DeviceResult<Vec<(Box<dyn PciDevice>, Option<Minijail>)>> {
1638 let stubs = create_virtio_devices(
1639 &cfg,
1640 mem,
1641 vm,
1642 resources,
1643 exit_evt,
1644 wayland_device_tube,
1645 gpu_device_tube,
1646 balloon_device_tube,
1647 disk_device_tubes,
1648 pmem_device_tubes,
1649 map_request,
1650 fs_device_tubes,
1651 )?;
1652
1653 let mut pci_devices = Vec::new();
1654
1655 for stub in stubs {
1656 let (msi_host_tube, msi_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
1657 control_tubes.push(TaggedControlTube::VmIrq(msi_host_tube));
1658 let dev = VirtioPciDevice::new(mem.clone(), stub.dev, msi_device_tube)
1659 .map_err(Error::VirtioPciDev)?;
1660 let dev = Box::new(dev) as Box<dyn PciDevice>;
1661 pci_devices.push((dev, stub.jail));
1662 }
1663
1664 #[cfg(feature = "audio")]
1665 for ac97_param in &cfg.ac97_parameters {
1666 let dev = Ac97Dev::try_new(mem.clone(), ac97_param.clone()).map_err(Error::CreateAc97)?;
1667 let jail = simple_jail(&cfg, dev.minijail_policy())?;
1668 pci_devices.push((Box::new(dev), jail));
1669 }
1670
1671 // Create xhci controller.
1672 let usb_controller = Box::new(XhciController::new(mem.clone(), usb_provider));
1673 pci_devices.push((usb_controller, simple_jail(&cfg, "xhci")?));
1674
1675 if !cfg.vfio.is_empty() {
1676 let vfio_container = Arc::new(Mutex::new(
1677 VfioContainer::new().map_err(Error::CreateVfioDevice)?,
1678 ));
1679
1680 for vfio_path in &cfg.vfio {
1681 // create MSI, MSI-X, and Mem request sockets for each vfio device
1682 let (vfio_host_tube_msi, vfio_device_tube_msi) =
1683 Tube::pair().map_err(Error::CreateTube)?;
1684 control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msi));
1685
1686 let (vfio_host_tube_msix, vfio_device_tube_msix) =
1687 Tube::pair().map_err(Error::CreateTube)?;
1688 control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msix));
1689
1690 let (vfio_host_tube_mem, vfio_device_tube_mem) =
1691 Tube::pair().map_err(Error::CreateTube)?;
1692 control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem));
1693
1694 let vfiodevice = VfioDevice::new(vfio_path.as_path(), vm, mem, vfio_container.clone())
1695 .map_err(Error::CreateVfioDevice)?;
1696 let mut vfiopcidevice = Box::new(VfioPciDevice::new(
1697 vfiodevice,
1698 vfio_device_tube_msi,
1699 vfio_device_tube_msix,
1700 vfio_device_tube_mem,
1701 ));
1702 // early reservation for pass-through PCI devices.
1703 if vfiopcidevice.allocate_address(resources).is_err() {
1704 warn!(
1705 "address reservation failed for vfio {}",
1706 vfiopcidevice.debug_label()
1707 );
1708 }
1709 pci_devices.push((vfiopcidevice, simple_jail(&cfg, "vfio_device")?));
1710 }
1711 }
1712
1713 Ok(pci_devices)
1714 }
1715
1716 #[derive(Copy, Clone)]
1717 #[cfg_attr(not(feature = "tpm"), allow(dead_code))]
1718 struct Ids {
1719 uid: uid_t,
1720 gid: gid_t,
1721 }
1722
1723 // Set the uid/gid for the jailed process and give a basic id map. This is
1724 // required for bind mounts to work.
add_crosvm_user_to_jail(jail: &mut Minijail, feature: &str) -> Result<Ids>1725 fn add_crosvm_user_to_jail(jail: &mut Minijail, feature: &str) -> Result<Ids> {
1726 let crosvm_user_group = CStr::from_bytes_with_nul(b"crosvm\0").unwrap();
1727
1728 let crosvm_uid = match get_user_id(&crosvm_user_group) {
1729 Ok(u) => u,
1730 Err(e) => {
1731 warn!("falling back to current user id for {}: {}", feature, e);
1732 geteuid()
1733 }
1734 };
1735
1736 let crosvm_gid = match get_group_id(&crosvm_user_group) {
1737 Ok(u) => u,
1738 Err(e) => {
1739 warn!("falling back to current group id for {}: {}", feature, e);
1740 getegid()
1741 }
1742 };
1743
1744 jail.change_uid(crosvm_uid);
1745 jail.change_gid(crosvm_gid);
1746 jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
1747 .map_err(Error::SettingUidMap)?;
1748 jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
1749 .map_err(Error::SettingGidMap)?;
1750
1751 Ok(Ids {
1752 uid: crosvm_uid,
1753 gid: crosvm_gid,
1754 })
1755 }
1756
raw_descriptor_from_path(path: &Path) -> Result<RawDescriptor>1757 fn raw_descriptor_from_path(path: &Path) -> Result<RawDescriptor> {
1758 if !path.is_file() {
1759 return Err(Error::InvalidFdPath);
1760 }
1761 let raw_descriptor = path
1762 .file_name()
1763 .and_then(|fd_osstr| fd_osstr.to_str())
1764 .and_then(|fd_str| fd_str.parse::<c_int>().ok())
1765 .ok_or(Error::InvalidFdPath)?;
1766 validate_raw_descriptor(raw_descriptor).map_err(Error::ValidateRawDescriptor)
1767 }
1768
1769 trait IntoUnixStream {
into_unix_stream(self) -> Result<UnixStream>1770 fn into_unix_stream(self) -> Result<UnixStream>;
1771 }
1772
1773 impl<'a> IntoUnixStream for &'a Path {
into_unix_stream(self) -> Result<UnixStream>1774 fn into_unix_stream(self) -> Result<UnixStream> {
1775 if self.parent() == Some(Path::new("/proc/self/fd")) {
1776 // Safe because we will validate |raw_fd|.
1777 unsafe { Ok(UnixStream::from_raw_fd(raw_descriptor_from_path(self)?)) }
1778 } else {
1779 UnixStream::connect(self).map_err(Error::InputEventsOpen)
1780 }
1781 }
1782 }
1783 impl<'a> IntoUnixStream for &'a PathBuf {
into_unix_stream(self) -> Result<UnixStream>1784 fn into_unix_stream(self) -> Result<UnixStream> {
1785 self.as_path().into_unix_stream()
1786 }
1787 }
1788
// A `UnixStream` is already the target type, so the conversion hands it back unchanged.
impl IntoUnixStream for UnixStream {
    fn into_unix_stream(self) -> Result<UnixStream> {
        Ok(self)
    }
}
1794
setup_vcpu_signal_handler<T: Vcpu>(use_hypervisor_signals: bool) -> Result<()>1795 fn setup_vcpu_signal_handler<T: Vcpu>(use_hypervisor_signals: bool) -> Result<()> {
1796 if use_hypervisor_signals {
1797 unsafe {
1798 extern "C" fn handle_signal(_: c_int) {}
1799 // Our signal handler does nothing and is trivially async signal safe.
1800 register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
1801 .map_err(Error::RegisterSignalHandler)?;
1802 }
1803 block_signal(SIGRTMIN() + 0).map_err(Error::BlockSignal)?;
1804 } else {
1805 unsafe {
1806 extern "C" fn handle_signal<T: Vcpu>(_: c_int) {
1807 T::set_local_immediate_exit(true);
1808 }
1809 register_rt_signal_handler(SIGRTMIN() + 0, handle_signal::<T>)
1810 .map_err(Error::RegisterSignalHandler)?;
1811 }
1812 }
1813 Ok(())
1814 }
1815
1816 // Sets up a vcpu and converts it into a runnable vcpu.
runnable_vcpu<V>( cpu_id: usize, vcpu: Option<V>, vm: impl VmArch, irq_chip: &mut impl IrqChipArch, vcpu_count: usize, run_rt: bool, vcpu_affinity: Vec<usize>, no_smt: bool, has_bios: bool, use_hypervisor_signals: bool, ) -> Result<(V, VcpuRunHandle)> where V: VcpuArch,1817 fn runnable_vcpu<V>(
1818 cpu_id: usize,
1819 vcpu: Option<V>,
1820 vm: impl VmArch,
1821 irq_chip: &mut impl IrqChipArch,
1822 vcpu_count: usize,
1823 run_rt: bool,
1824 vcpu_affinity: Vec<usize>,
1825 no_smt: bool,
1826 has_bios: bool,
1827 use_hypervisor_signals: bool,
1828 ) -> Result<(V, VcpuRunHandle)>
1829 where
1830 V: VcpuArch,
1831 {
1832 let mut vcpu = match vcpu {
1833 Some(v) => v,
1834 None => {
1835 // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from
1836 // the vcpu thread.
1837 match vm
1838 .create_vcpu(cpu_id)
1839 .map_err(Error::CreateVcpu)?
1840 .downcast::<V>()
1841 {
1842 Ok(v) => *v,
1843 Err(_) => panic!("VM created wrong type of VCPU"),
1844 }
1845 }
1846 };
1847
1848 irq_chip
1849 .add_vcpu(cpu_id, &vcpu)
1850 .map_err(Error::AddIrqChipVcpu)?;
1851
1852 if !vcpu_affinity.is_empty() {
1853 if let Err(e) = set_cpu_affinity(vcpu_affinity) {
1854 error!("Failed to set CPU affinity: {}", e);
1855 }
1856 }
1857
1858 Arch::configure_vcpu(
1859 vm.get_memory(),
1860 vm.get_hypervisor(),
1861 irq_chip,
1862 &mut vcpu,
1863 cpu_id,
1864 vcpu_count,
1865 has_bios,
1866 no_smt,
1867 )
1868 .map_err(Error::ConfigureVcpu)?;
1869
1870 #[cfg(feature = "chromeos")]
1871 if let Err(e) = base::sched::enable_core_scheduling() {
1872 error!("Failed to enable core scheduling: {}", e);
1873 }
1874
1875 if run_rt {
1876 const DEFAULT_VCPU_RT_LEVEL: u16 = 6;
1877 if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL))
1878 .and_then(|_| set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL)))
1879 {
1880 warn!("Failed to set vcpu to real time: {}", e);
1881 }
1882 }
1883
1884 if use_hypervisor_signals {
1885 let mut v = get_blocked_signals().map_err(Error::GetSignalMask)?;
1886 v.retain(|&x| x != SIGRTMIN() + 0);
1887 vcpu.set_signal_mask(&v).map_err(Error::SettingSignalMask)?;
1888 }
1889
1890 let vcpu_run_handle = vcpu
1891 .take_run_handle(Some(SIGRTMIN() + 0))
1892 .map_err(Error::RunnableVcpu)?;
1893
1894 Ok((vcpu, vcpu_run_handle))
1895 }
1896
// Executes one GDB request `d` against `vcpu` and reports the outcome on `reply_tube`.
//
// Every request produces exactly one `VcpuDebugStatusMessage` tagged with `cpu_id`:
// register and memory reads carry the data that was read, all other requests reply with
// `CommandComplete`.
//
// Returns `Error::HandleDebugCommand` if the arch-specific operation fails (no reply is sent in
// that case) and `Error::SendDebugStatus` if the reply cannot be sent. A failed memory read is
// not an error: it is reported to the debugger as an empty memory region.
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
fn handle_debug_msg<V>(
    cpu_id: usize,
    vcpu: &V,
    guest_mem: &GuestMemory,
    d: VcpuDebug,
    reply_tube: &mpsc::Sender<VcpuDebugStatusMessage>,
) -> Result<()>
where
    V: VcpuArch + 'static,
{
    // Perform the requested operation and work out which status to report back.
    let status = match d {
        VcpuDebug::ReadRegs => VcpuDebugStatus::RegValues(
            Arch::debug_read_registers(vcpu).map_err(Error::HandleDebugCommand)?,
        ),
        VcpuDebug::WriteRegs(regs) => {
            Arch::debug_write_registers(vcpu, &regs).map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
        VcpuDebug::ReadMem(vaddr, len) => VcpuDebugStatus::MemoryRegion(
            // A failed read is deliberately reported as an empty region instead of an error.
            Arch::debug_read_memory(vcpu, guest_mem, vaddr, len).unwrap_or_default(),
        ),
        VcpuDebug::WriteMem(vaddr, buf) => {
            Arch::debug_write_memory(vcpu, guest_mem, vaddr, &buf)
                .map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
        VcpuDebug::EnableSinglestep => {
            Arch::debug_enable_singlestep(vcpu).map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
        VcpuDebug::SetHwBreakPoint(addrs) => {
            Arch::debug_set_hw_breakpoints(vcpu, &addrs).map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
    };

    reply_tube
        .send(VcpuDebugStatusMessage {
            cpu: cpu_id,
            msg: status,
        })
        .map_err(|e| Error::SendDebugStatus(Box::new(e)))
}
1972
// Spawns the thread named `crosvm_vcpu{cpu_id}` that sets up and runs one vcpu, and returns its
// `JoinHandle` (or `Error::SpawnVcpu` if the thread cannot be created).
//
// Inside the thread:
//   1. `exit_evt` is wrapped in a `ScopedEvent` so it is triggered on every exit path.
//   2. The vcpu is prepared with `runnable_vcpu`, then the thread waits on `start_barrier`.
//      The barrier is waited on even if setup failed; the failure is logged afterwards and
//      the thread returns.
//   3. The main loop alternates between draining control messages from `from_main_tube`
//      (only when the last run was interrupted by a signal or the vcpu is not in
//      `VmRunMode::Running`) and running the vcpu, dispatching each exit to `io_bus`,
//      `mmio_bus` or `irq_chip`.
//
// When built with gdb support on x86_64, `to_gdb_tube` being `Some` makes the vcpu start in
// `VmRunMode::Breakpoint` and is used to report debug events.
fn run_vcpu<V>(
    cpu_id: usize,
    vcpu: Option<V>,
    vm: impl VmArch + 'static,
    mut irq_chip: impl IrqChipArch + 'static,
    vcpu_count: usize,
    run_rt: bool,
    vcpu_affinity: Vec<usize>,
    no_smt: bool,
    start_barrier: Arc<Barrier>,
    has_bios: bool,
    io_bus: devices::Bus,
    mmio_bus: devices::Bus,
    exit_evt: Event,
    requires_pvclock_ctrl: bool,
    from_main_tube: mpsc::Receiver<VcpuControl>,
    use_hypervisor_signals: bool,
    #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option<
        mpsc::Sender<VcpuDebugStatusMessage>,
    >,
) -> Result<JoinHandle<()>>
where
    V: VcpuArch + 'static,
{
    thread::Builder::new()
        .name(format!("crosvm_vcpu{}", cpu_id))
        .spawn(move || {
            // The VCPU thread must trigger the `exit_evt` in all paths, and a `ScopedEvent`'s Drop
            // implementation accomplishes that.
            let _scoped_exit_evt = ScopedEvent::from(exit_evt);

            // `vm` is moved into `runnable_vcpu` below, so the guest memory handle needed for
            // debug requests is cloned first.
            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
            let guest_mem = vm.get_memory().clone();
            let runnable_vcpu = runnable_vcpu(
                cpu_id,
                vcpu,
                vm,
                &mut irq_chip,
                vcpu_count,
                run_rt,
                vcpu_affinity,
                no_smt,
                has_bios,
                use_hypervisor_signals,
            );

            // Wait on the barrier before inspecting the setup result, so this thread reaches
            // the barrier whether or not setup succeeded.
            start_barrier.wait();

            let (vcpu, vcpu_run_handle) = match runnable_vcpu {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to start vcpu {}: {}", cpu_id, e);
                    return;
                }
            };

            let mut run_mode = VmRunMode::Running;
            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
            if to_gdb_tube.is_some() {
                // Wait until a GDB client attaches
                run_mode = VmRunMode::Breakpoint;
            }

            // Set when the previous wait/run was cut short by a signal; it gates the message
            // check at the top of the loop.
            let mut interrupted_by_signal = false;

            'vcpu_loop: loop {
                // Start by checking for messages to process and the run state of the CPU.
                // An extra check here for Running so there isn't a need to call recv unless a
                // message is likely to be ready because a signal was sent.
                if interrupted_by_signal || run_mode != VmRunMode::Running {
                    'state_loop: loop {
                        // Tries to get a pending message without blocking first.
                        let msg = match from_main_tube.try_recv() {
                            Ok(m) => m,
                            Err(mpsc::TryRecvError::Empty) if run_mode == VmRunMode::Running => {
                                // If the VM is running and no message is pending, the state won't
                                // change.
                                break 'state_loop;
                            }
                            Err(mpsc::TryRecvError::Empty) => {
                                // If the VM is not running, wait until a message is ready.
                                match from_main_tube.recv() {
                                    Ok(m) => m,
                                    Err(mpsc::RecvError) => {
                                        error!("Failed to read from main tube in vcpu");
                                        break 'vcpu_loop;
                                    }
                                }
                            }
                            Err(mpsc::TryRecvError::Disconnected) => {
                                error!("Failed to read from main tube in vcpu");
                                break 'vcpu_loop;
                            }
                        };

                        // Collect all pending messages.
                        let mut messages = vec![msg];
                        messages.append(&mut from_main_tube.try_iter().collect());

                        for msg in messages {
                            match msg {
                                VcpuControl::RunState(new_mode) => {
                                    run_mode = new_mode;
                                    match run_mode {
                                        // Leaving the state loop here means any messages still
                                        // left in `messages` are not processed.
                                        VmRunMode::Running => break 'state_loop,
                                        VmRunMode::Suspending => {
                                            // On KVM implementations that use a paravirtualized
                                            // clock (e.g. x86), a flag must be set to indicate to
                                            // the guest kernel that a vCPU was suspended. The guest
                                            // kernel will use this flag to prevent the soft lockup
                                            // detection from triggering when this vCPU resumes,
                                            // which could happen days later in realtime.
                                            if requires_pvclock_ctrl {
                                                if let Err(e) = vcpu.pvclock_ctrl() {
                                                    error!(
                                                        "failed to tell hypervisor vcpu {} is suspending: {}",
                                                        cpu_id, e
                                                    );
                                                }
                                            }
                                        }
                                        VmRunMode::Breakpoint => {}
                                        VmRunMode::Exiting => break 'vcpu_loop,
                                    }
                                }
                                #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
                                VcpuControl::Debug(d) => {
                                    match &to_gdb_tube {
                                        Some(ref ch) => {
                                            // A failed debug request is logged and otherwise
                                            // ignored; the vcpu keeps its current run mode.
                                            if let Err(e) = handle_debug_msg(
                                                cpu_id, &vcpu, &guest_mem, d, &ch,
                                            ) {
                                                error!("Failed to handle gdb message: {}", e);
                                            }
                                        },
                                        None => {
                                            error!("VcpuControl::Debug received while GDB feature is disabled: {:?}", d);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                interrupted_by_signal = false;

                // Vcpus may have run a HLT instruction, which puts them into a state other than
                // VcpuRunState::Runnable. In that case, this call to wait_until_runnable blocks
                // until either the irqchip receives an interrupt for this vcpu, or until the main
                // thread kicks this vcpu as a result of some VmControl operation. In most IrqChip
                // implementations HLT instructions do not make it to crosvm, and thus this is a
                // no-op that always returns VcpuRunState::Runnable.
                match irq_chip.wait_until_runnable(&vcpu) {
                    Ok(VcpuRunState::Runnable) => {}
                    Ok(VcpuRunState::Interrupted) => interrupted_by_signal = true,
                    Err(e) => error!(
                        "error waiting for vcpu {} to become runnable: {}",
                        cpu_id, e
                    ),
                }

                // Skip entering the guest if the wait above was interrupted, so that pending
                // control messages are looked at first on the next iteration.
                if !interrupted_by_signal {
                    // Every unlabeled `break` inside this match leaves 'vcpu_loop and so ends
                    // the thread.
                    match vcpu.run(&vcpu_run_handle) {
                        Ok(VcpuExit::IoIn { port, mut size }) => {
                            let mut data = [0; 8];
                            // Clamp oversized requests to the buffer instead of panicking on the
                            // slice below.
                            if size > data.len() {
                                error!("unsupported IoIn size of {} bytes", size);
                                size = data.len();
                            }
                            io_bus.read(port as u64, &mut data[..size]);
                            if let Err(e) = vcpu.set_data(&data[..size]) {
                                error!("failed to set return data for IoIn: {}", e);
                            }
                        }
                        Ok(VcpuExit::IoOut {
                            port,
                            mut size,
                            data,
                        }) => {
                            if size > data.len() {
                                error!("unsupported IoOut size of {} bytes", size);
                                size = data.len();
                            }
                            io_bus.write(port as u64, &data[..size]);
                        }
                        Ok(VcpuExit::MmioRead { address, size }) => {
                            // NOTE(review): unlike IoIn, `size` is not clamped here, so a value
                            // above 8 would panic on the slice. Presumably the hypervisor never
                            // reports more than 8 bytes; confirm.
                            let mut data = [0; 8];
                            mmio_bus.read(address, &mut data[..size]);
                            // Setting data for mmio can not fail.
                            let _ = vcpu.set_data(&data[..size]);
                        }
                        Ok(VcpuExit::MmioWrite {
                            address,
                            size,
                            data,
                        }) => {
                            mmio_bus.write(address, &data[..size]);
                        }
                        Ok(VcpuExit::IoapicEoi { vector }) => {
                            if let Err(e) = irq_chip.broadcast_eoi(vector) {
                                error!(
                                    "failed to broadcast eoi {} on vcpu {}: {}",
                                    vector, cpu_id, e
                                );
                            }
                        }
                        Ok(VcpuExit::IrqWindowOpen) => {}
                        Ok(VcpuExit::Hlt) => irq_chip.halted(cpu_id),
                        Ok(VcpuExit::Shutdown) => break,
                        Ok(VcpuExit::FailEntry {
                            hardware_entry_failure_reason,
                        }) => {
                            error!("vcpu hw run failure: {:#x}", hardware_entry_failure_reason);
                            break;
                        }
                        Ok(VcpuExit::SystemEvent(_, _)) => break,
                        Ok(VcpuExit::Debug { .. }) => {
                            // Without gdb support this exit is silently ignored.
                            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
                            {
                                let msg = VcpuDebugStatusMessage {
                                    cpu: cpu_id as usize,
                                    msg: VcpuDebugStatus::HitBreakPoint,
                                };
                                if let Some(ref ch) = to_gdb_tube {
                                    if let Err(e) = ch.send(msg) {
                                        error!("failed to notify breakpoint to GDB thread: {}", e);
                                        break;
                                    }
                                }
                                // Stop running; the state loop above now waits for the next
                                // control message.
                                run_mode = VmRunMode::Breakpoint;
                            }
                        }
                        Ok(r) => warn!("unexpected vcpu exit: {:?}", r),
                        Err(e) => match e.errno() {
                            libc::EINTR => interrupted_by_signal = true,
                            libc::EAGAIN => {}
                            _ => {
                                error!("vcpu hit unknown error: {}", e);
                                break;
                            }
                        },
                    }
                }

                if interrupted_by_signal {
                    if use_hypervisor_signals {
                        // Try to clear the signal that we use to kick VCPU if it is pending before
                        // attempting to handle pause requests.
                        if let Err(e) = clear_signal(SIGRTMIN() + 0) {
                            error!("failed to clear pending signal: {}", e);
                            break;
                        }
                    } else {
                        // Without hypervisor signals the kick handler requested an immediate
                        // exit; reset that request before the next run.
                        vcpu.set_immediate_exit(false);
                    }
                }

                if let Err(e) = irq_chip.inject_interrupts(&vcpu) {
                    error!("failed to inject interrupts for vcpu {}: {}", cpu_id, e);
                }
            }
        })
        .map_err(Error::SpawnVcpu)
}
2238
// Reads the contents of a file and converts the whitespace-separated fields into a Vec of i64s.
//
// Returns an error if the file cannot be opened or read, if its contents are not valid UTF-8
// (`InvalidData`), or if any of the fields fails to parse as an i64 (`InvalidData`). A file
// containing only whitespace yields an empty Vec.
fn file_fields_to_i64<P: AsRef<Path>>(path: P) -> io::Result<Vec<i64>> {
    // Read the whole file rather than a fixed-size prefix, so a long line can never be cut in
    // the middle of a number and silently parsed as a different value.
    let mut content = String::new();
    File::open(path)?.read_to_string(&mut content)?;

    // `split_whitespace` already ignores leading and trailing whitespace.
    content
        .split_whitespace()
        .map(|field| {
            field
                .parse::<i64>()
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
        })
        .collect()
}
2258
2259 // Reads the contents of a file and converts them into a u64, and if there
2260 // are multiple fields it only returns the first one.
file_to_i64<P: AsRef<Path>>(path: P, nth: usize) -> io::Result<i64>2261 fn file_to_i64<P: AsRef<Path>>(path: P, nth: usize) -> io::Result<i64> {
2262 file_fields_to_i64(path)?
2263 .into_iter()
2264 .nth(nth)
2265 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "empty file"))
2266 }
2267
create_kvm_kernel_irq_chip( vm: &KvmVm, vcpu_count: usize, _ioapic_device_tube: Tube, ) -> base::Result<impl IrqChipArch>2268 fn create_kvm_kernel_irq_chip(
2269 vm: &KvmVm,
2270 vcpu_count: usize,
2271 _ioapic_device_tube: Tube,
2272 ) -> base::Result<impl IrqChipArch> {
2273 let irq_chip = KvmKernelIrqChip::new(vm.try_clone()?, vcpu_count)?;
2274 Ok(irq_chip)
2275 }
2276
2277 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
create_kvm_split_irq_chip( vm: &KvmVm, vcpu_count: usize, ioapic_device_tube: Tube, ) -> base::Result<impl IrqChipArch>2278 fn create_kvm_split_irq_chip(
2279 vm: &KvmVm,
2280 vcpu_count: usize,
2281 ioapic_device_tube: Tube,
2282 ) -> base::Result<impl IrqChipArch> {
2283 let irq_chip =
2284 KvmSplitIrqChip::new(vm.try_clone()?, vcpu_count, ioapic_device_tube, Some(120))?;
2285 Ok(irq_chip)
2286 }
2287
run_config(cfg: Config) -> Result<()>2288 pub fn run_config(cfg: Config) -> Result<()> {
2289 let components = setup_vm_components(&cfg)?;
2290
2291 let guest_mem_layout =
2292 Arch::guest_memory_layout(&components).map_err(Error::GuestMemoryLayout)?;
2293 let guest_mem = GuestMemory::new(&guest_mem_layout).unwrap();
2294 let mut mem_policy = MemoryPolicy::empty();
2295 if components.hugepages {
2296 mem_policy |= MemoryPolicy::USE_HUGEPAGES;
2297 }
2298 guest_mem.set_memory_policy(mem_policy);
2299 let kvm = Kvm::new_with_path(&cfg.kvm_device_path).map_err(Error::CreateKvm)?;
2300 let vm = KvmVm::new(&kvm, guest_mem).map_err(Error::CreateVm)?;
2301
2302 if cfg.split_irqchip {
2303 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2304 {
2305 unimplemented!("KVM split irqchip mode only supported on x86 processors")
2306 }
2307
2308 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2309 {
2310 run_vm::<KvmVcpu, _, _, _>(cfg, components, vm, create_kvm_split_irq_chip)
2311 }
2312 } else {
2313 run_vm::<KvmVcpu, _, _, _>(cfg, components, vm, create_kvm_kernel_irq_chip)
2314 }
2315 }
2316
setup_vm_components(cfg: &Config) -> Result<VmComponents>2317 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
2318 let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
2319 Some(File::open(initrd_path).map_err(|e| Error::OpenInitrd(initrd_path.clone(), e))?)
2320 } else {
2321 None
2322 };
2323
2324 let vm_image = match cfg.executable_path {
2325 Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
2326 File::open(kernel_path).map_err(|e| Error::OpenKernel(kernel_path.to_path_buf(), e))?,
2327 ),
2328 Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
2329 File::open(bios_path).map_err(|e| Error::OpenBios(bios_path.to_path_buf(), e))?,
2330 ),
2331 _ => panic!("Did not receive a bios or kernel, should be impossible."),
2332 };
2333
2334 Ok(VmComponents {
2335 memory_size: cfg
2336 .memory
2337 .unwrap_or(256)
2338 .checked_mul(1024 * 1024)
2339 .ok_or(Error::MemoryTooLarge)?,
2340 vcpu_count: cfg.vcpu_count.unwrap_or(1),
2341 vcpu_affinity: cfg.vcpu_affinity.clone(),
2342 no_smt: cfg.no_smt,
2343 hugepages: cfg.hugepages,
2344 vm_image,
2345 android_fstab: cfg
2346 .android_fstab
2347 .as_ref()
2348 .map(|x| File::open(x).map_err(|e| Error::OpenAndroidFstab(x.to_path_buf(), e)))
2349 .map_or(Ok(None), |v| v.map(Some))?,
2350 pstore: cfg.pstore.clone(),
2351 initrd_image,
2352 extra_kernel_params: cfg.params.clone(),
2353 wayland_dmabuf: cfg.wayland_dmabuf,
2354 acpi_sdts: cfg
2355 .acpi_tables
2356 .iter()
2357 .map(|path| SDT::from_file(path).map_err(|e| Error::OpenAcpiTable(path.clone(), e)))
2358 .collect::<Result<Vec<SDT>>>()?,
2359 rt_cpus: cfg.rt_cpus.clone(),
2360 protected_vm: cfg.protected_vm,
2361 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2362 gdb: None,
2363 dmi_path: cfg.dmi_path.clone(),
2364 })
2365 }
2366
run_vm<Vcpu, V, I, FI>( cfg: Config, #[allow(unused_mut)] mut components: VmComponents, vm: V, create_irq_chip: FI, ) -> Result<()> where Vcpu: VcpuArch + 'static, V: VmArch + 'static, I: IrqChipArch + 'static, FI: FnOnce( &V, usize, Tube, ) -> base::Result<I>,2367 fn run_vm<Vcpu, V, I, FI>(
2368 cfg: Config,
2369 #[allow(unused_mut)] mut components: VmComponents,
2370 vm: V,
2371 create_irq_chip: FI,
2372 ) -> Result<()>
2373 where
2374 Vcpu: VcpuArch + 'static,
2375 V: VmArch + 'static,
2376 I: IrqChipArch + 'static,
2377 FI: FnOnce(
2378 &V,
2379 usize, // vcpu_count
2380 Tube, // ioapic_device_tube
2381 ) -> base::Result<I>,
2382 {
2383 if cfg.sandbox {
2384 // Printing something to the syslog before entering minijail so that libc's syslogger has a
2385 // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
2386 // access to those files will not be possible.
2387 info!("crosvm entering multiprocess mode");
2388 }
2389
2390 let (usb_control_tube, usb_provider) =
2391 HostBackendDeviceProvider::new().map_err(Error::CreateUsbProvider)?;
2392 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
2393 // before any jailed devices have been spawned, so that we can catch any of them that fail very
2394 // quickly.
2395 let sigchld_fd = SignalFd::new(libc::SIGCHLD).map_err(Error::CreateSignalFd)?;
2396
2397 let control_server_socket = match &cfg.socket_path {
2398 Some(path) => Some(UnlinkUnixSeqpacketListener(
2399 UnixSeqpacketListener::bind(path).map_err(Error::CreateControlServer)?,
2400 )),
2401 None => None,
2402 };
2403
2404 let mut control_tubes = Vec::new();
2405
2406 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2407 if let Some(port) = cfg.gdb {
2408 // GDB needs a control socket to interrupt vcpus.
2409 let (gdb_host_tube, gdb_control_tube) = Tube::pair().map_err(Error::CreateTube)?;
2410 control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
2411 components.gdb = Some((port, gdb_control_tube));
2412 }
2413
2414 let (wayland_host_tube, wayland_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2415 control_tubes.push(TaggedControlTube::VmMemory(wayland_host_tube));
2416 // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2417 let (balloon_host_tube, balloon_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2418
2419 // Create one control socket per disk.
2420 let mut disk_device_tubes = Vec::new();
2421 let mut disk_host_tubes = Vec::new();
2422 let disk_count = cfg.disks.len();
2423 for _ in 0..disk_count {
2424 let (disk_host_tub, disk_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2425 disk_host_tubes.push(disk_host_tub);
2426 disk_device_tubes.push(disk_device_tube);
2427 }
2428
2429 let mut pmem_device_tubes = Vec::new();
2430 let pmem_count = cfg.pmem_devices.len();
2431 for _ in 0..pmem_count {
2432 let (pmem_host_tube, pmem_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2433 pmem_device_tubes.push(pmem_device_tube);
2434 control_tubes.push(TaggedControlTube::VmMsync(pmem_host_tube));
2435 }
2436
2437 let (gpu_host_tube, gpu_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2438 control_tubes.push(TaggedControlTube::VmMemory(gpu_host_tube));
2439
2440 let (ioapic_host_tube, ioapic_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2441 control_tubes.push(TaggedControlTube::VmIrq(ioapic_host_tube));
2442
2443 let battery = if cfg.battery_type.is_some() {
2444 let jail = match simple_jail(&cfg, "battery")? {
2445 #[cfg_attr(not(feature = "powerd-monitor-powerd"), allow(unused_mut))]
2446 Some(mut jail) => {
2447 // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
2448 #[cfg(feature = "power-monitor-powerd")]
2449 {
2450 add_crosvm_user_to_jail(&mut jail, "battery")?;
2451
2452 // Create a tmpfs in the device's root directory so that we can bind mount files.
2453 jail.mount_with_data(
2454 Path::new("none"),
2455 Path::new("/"),
2456 "tmpfs",
2457 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
2458 "size=67108864",
2459 )?;
2460
2461 let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
2462 jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
2463 }
2464 Some(jail)
2465 }
2466 None => None,
2467 };
2468 (&cfg.battery_type, jail)
2469 } else {
2470 (&cfg.battery_type, None)
2471 };
2472
2473 let gralloc = RutabagaGralloc::new().map_err(Error::CreateGrallocError)?;
2474 let map_request: Arc<Mutex<Option<ExternalMapping>>> = Arc::new(Mutex::new(None));
2475
2476 let fs_count = cfg
2477 .shared_dirs
2478 .iter()
2479 .filter(|sd| sd.kind == SharedDirKind::FS)
2480 .count();
2481 let mut fs_device_tubes = Vec::with_capacity(fs_count);
2482 for _ in 0..fs_count {
2483 let (fs_host_tube, fs_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2484 control_tubes.push(TaggedControlTube::Fs(fs_host_tube));
2485 fs_device_tubes.push(fs_device_tube);
2486 }
2487
2488 #[cfg_attr(not(feature = "direct"), allow(unused_mut))]
2489 let mut linux: RunnableLinuxVm<_, Vcpu, _> = Arch::build_vm(
2490 components,
2491 &cfg.serial_parameters,
2492 simple_jail(&cfg, "serial")?,
2493 battery,
2494 vm,
2495 |mem, vm, sys_allocator, exit_evt| {
2496 create_devices(
2497 &cfg,
2498 mem,
2499 vm,
2500 sys_allocator,
2501 exit_evt,
2502 &mut control_tubes,
2503 wayland_device_tube,
2504 gpu_device_tube,
2505 balloon_device_tube,
2506 &mut disk_device_tubes,
2507 &mut pmem_device_tubes,
2508 &mut fs_device_tubes,
2509 usb_provider,
2510 Arc::clone(&map_request),
2511 )
2512 },
2513 |vm, vcpu_count| create_irq_chip(vm, vcpu_count, ioapic_device_tube),
2514 )
2515 .map_err(Error::BuildVm)?;
2516
2517 #[cfg(feature = "direct")]
2518 if let Some(pmio) = &cfg.direct_pmio {
2519 let direct_io =
2520 Arc::new(devices::DirectIo::new(&pmio.path, false).map_err(Error::DirectIo)?);
2521 for range in pmio.ranges.iter() {
2522 linux
2523 .io_bus
2524 .insert_sync(direct_io.clone(), range.0, range.1)
2525 .unwrap();
2526 }
2527 };
2528
2529 #[cfg(feature = "direct")]
2530 let mut irqs = Vec::new();
2531
2532 #[cfg(feature = "direct")]
2533 for irq in &cfg.direct_level_irq {
2534 if !linux.resources.reserve_irq(*irq) {
2535 warn!("irq {} already reserved.", irq);
2536 }
2537 let trigger = Event::new().map_err(Error::CreateEvent)?;
2538 let resample = Event::new().map_err(Error::CreateEvent)?;
2539 linux
2540 .irq_chip
2541 .register_irq_event(*irq, &trigger, Some(&resample))
2542 .unwrap();
2543 let direct_irq =
2544 devices::DirectIrq::new(trigger, Some(resample)).map_err(Error::DirectIrq)?;
2545 direct_irq.irq_enable(*irq).map_err(Error::DirectIrq)?;
2546 irqs.push(direct_irq);
2547 }
2548
2549 #[cfg(feature = "direct")]
2550 for irq in &cfg.direct_edge_irq {
2551 if !linux.resources.reserve_irq(*irq) {
2552 warn!("irq {} already reserved.", irq);
2553 }
2554 let trigger = Event::new().map_err(Error::CreateEvent)?;
2555 linux
2556 .irq_chip
2557 .register_irq_event(*irq, &trigger, None)
2558 .unwrap();
2559 let direct_irq = devices::DirectIrq::new(trigger, None).map_err(Error::DirectIrq)?;
2560 direct_irq.irq_enable(*irq).map_err(Error::DirectIrq)?;
2561 irqs.push(direct_irq);
2562 }
2563
2564 run_control(
2565 linux,
2566 control_server_socket,
2567 control_tubes,
2568 balloon_host_tube,
2569 &disk_host_tubes,
2570 usb_control_tube,
2571 sigchld_fd,
2572 cfg.sandbox,
2573 Arc::clone(&map_request),
2574 cfg.balloon_bias,
2575 gralloc,
2576 )
2577 }
2578
2579 /// Signals all running VCPUs to vmexit, sends VmRunMode message to each VCPU tube, and tells
2580 /// `irq_chip` to stop blocking halted VCPUs. The tube message is set first because both the
2581 /// signal and the irq_chip kick could cause the VCPU thread to continue through the VCPU run
2582 /// loop.
kick_all_vcpus( vcpu_handles: &[(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)], irq_chip: &impl IrqChip, run_mode: &VmRunMode, )2583 fn kick_all_vcpus(
2584 vcpu_handles: &[(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
2585 irq_chip: &impl IrqChip,
2586 run_mode: &VmRunMode,
2587 ) {
2588 for (handle, tube) in vcpu_handles {
2589 if let Err(e) = tube.send(VcpuControl::RunState(run_mode.clone())) {
2590 error!("failed to send VmRunMode: {}", e);
2591 }
2592 let _ = handle.kill(SIGRTMIN() + 0);
2593 }
2594 irq_chip.kick_halted_vcpus();
2595 }
2596
// BalloonPolicy determines the size to set the balloon.
//
// It carries the thresholds and the state remembered between successive calls to `delta`.
struct BalloonPolicy {
    // Estimate for when the guest starts aggressively freeing memory.
    critical_guest_available: i64,
    // ChromeOS critical margin.
    critical_host_available: i64,
    // Scaled by how full the balloon is and added to the guest critical margin.
    guest_available_bias: i64,
    // The largest the balloon has ever been observed.
    max_balloon_actual: i64,
    // How full was the balloon at the previous timestep.
    prev_balloon_full_percent: i64,
    // Available memory in the guest at the previous timestep.
    prev_guest_available: i64,
}
2607
// Byte-count multipliers used by the balloon sizing arithmetic.
const ONE_KB: i64 = 1024;
const ONE_MB: i64 = 1024 * ONE_KB;

// File from which `BalloonPolicy::delta` reads the host's available memory; that code treats
// the value as being in MB.
const LOWMEM_AVAILABLE: &str = "/sys/kernel/mm/chromeos-low_mem/available";
// NOTE(review): presumably holds the ChromeOS low-memory margin(s); it is not read in the part
// of the file visible here, so confirm against its users.
const LOWMEM_MARGIN: &str = "/sys/kernel/mm/chromeos-low_mem/margin";
2613
2614 // BalloonPolicy implements the virtio balloon sizing logic.
2615 // The balloon is sized with the following heuristics:
2616 // Balance Available
2617 // The balloon is sized to balance the amount of available memory above a
2618 // critical margin. The critical margin is the level at which memory is
2619 // freed. In the host, this is the ChromeOS available critical margin, which
2620 // is the trigger to kill tabs. In the guest, we estimate this level by
2621 // tracking the minimum amount of available memory, discounting sharp
2622 // 'valleys'. If the guest manages to keep available memory above a given
2623 // level even with some pressure, then we determine that this is the
2624 // 'critical' level for the guest. We don't update this critical value if
2625 // the balloon is fully inflated because in that case, the guest may be out
2626 // of memory to free.
2627 // guest_available_bias
// Even if available memory is perfectly balanced between host and guest,
// the size of the balloon will still drift randomly depending on whether
// the host or the guest reclaims memory first/faster every time memory is
// low. To encourage large balloons to shrink and small balloons to grow,
// the following bias is added to the guest critical margin:
// (guest_available_bias * balloon_full_percent) / 100
// This gives the guest more memory when the balloon is full.
2635 impl BalloonPolicy {
new( memory_size: i64, critical_host_available: i64, guest_available_bias: i64, ) -> BalloonPolicy2636 fn new(
2637 memory_size: i64,
2638 critical_host_available: i64,
2639 guest_available_bias: i64,
2640 ) -> BalloonPolicy {
2641 // Estimate some reasonable initial maximum for balloon size.
2642 let max_balloon_actual = (memory_size * 3) / 4;
2643 // 400MB is above the zone min margin even for Crostini VMs on 16GB
2644 // devices (~85MB), and is above when Android Low Memory Killer kills
2645 // apps (~250MB).
2646 let critical_guest_available = 400 * ONE_MB;
2647
2648 BalloonPolicy {
2649 critical_guest_available,
2650 critical_host_available,
2651 guest_available_bias,
2652 max_balloon_actual,
2653 prev_balloon_full_percent: 0,
2654 prev_guest_available: 0,
2655 }
2656 }
delta(&mut self, stats: BalloonStats, balloon_actual_u: u64) -> Result<i64>2657 fn delta(&mut self, stats: BalloonStats, balloon_actual_u: u64) -> Result<i64> {
2658 let guest_free = stats
2659 .free_memory
2660 .map(i64::try_from)
2661 .ok_or(Error::GuestFreeMissing())?
2662 .map_err(Error::GuestFreeTooLarge)?;
2663 let guest_cached = stats
2664 .disk_caches
2665 .map(i64::try_from)
2666 .ok_or(Error::GuestFreeMissing())?
2667 .map_err(Error::GuestFreeTooLarge)?;
2668 let balloon_actual = match balloon_actual_u {
2669 size if size < i64::max_value() as u64 => size as i64,
2670 _ => return Err(Error::BalloonActualTooLarge),
2671 };
2672 let guest_available = guest_free + guest_cached;
2673 // Available memory is reported in MB, and we need bytes.
2674 let host_available =
2675 file_to_i64(LOWMEM_AVAILABLE, 0).map_err(Error::ReadMemAvailable)? * ONE_MB;
2676 if self.max_balloon_actual < balloon_actual {
2677 self.max_balloon_actual = balloon_actual;
2678 info!(
2679 "balloon updated max_balloon_actual to {} MiB",
2680 self.max_balloon_actual / ONE_MB,
2681 );
2682 }
2683 let balloon_full_percent = balloon_actual * 100 / self.max_balloon_actual;
2684 // Update critical_guest_available if we see a lower available with the
2685 // balloon not fully inflated. If the balloon is completely inflated
2686 // there is a risk that the low available level we see comes at the cost
2687 // of stability. The Linux OOM Killer might have been forced to kill
2688 // something important, or page reclaim was so aggressive that there are
2689 // long UI hangs.
2690 if guest_available < self.critical_guest_available && balloon_full_percent < 95 {
2691 // To ignore temporary low memory states, we require that two guest
2692 // available measurements in a row are low.
2693 if self.prev_guest_available < self.critical_guest_available
2694 && self.prev_balloon_full_percent < 95
2695 {
2696 self.critical_guest_available = self.prev_guest_available;
2697 info!(
2698 "balloon updated critical_guest_available to {} MiB",
2699 self.critical_guest_available / ONE_MB,
2700 );
2701 }
2702 }
2703
2704 // Compute the difference in available memory above the host and guest
2705 // critical thresholds.
2706 let bias = (self.guest_available_bias * balloon_full_percent) / 100;
2707 let guest_above_critical = guest_available - self.critical_guest_available - bias;
2708 let host_above_critical = host_available - self.critical_host_available;
2709 let balloon_delta = guest_above_critical - host_above_critical;
2710 // Only let the balloon take up MAX_CRITICAL_DELTA of available memory
2711 // below the critical level in host or guest.
2712 const MAX_CRITICAL_DELTA: i64 = 10 * ONE_MB;
2713 let balloon_delta_capped = if balloon_delta < 0 {
2714 // The balloon is deflating, taking memory from the host. Don't let
2715 // it take more than the amount of available memory above the
2716 // critical margin, plus MAX_CRITICAL_DELTA.
2717 max(
2718 balloon_delta,
2719 -(host_available - self.critical_host_available + MAX_CRITICAL_DELTA),
2720 )
2721 } else {
2722 // The balloon is inflating, taking memory from the guest. Don't let
2723 // it take more than the amount of available memory above the
2724 // critical margin, plus MAX_CRITICAL_DELTA.
2725 min(
2726 balloon_delta,
2727 guest_available - self.critical_guest_available + MAX_CRITICAL_DELTA,
2728 )
2729 };
2730
2731 self.prev_balloon_full_percent = balloon_full_percent;
2732 self.prev_guest_available = guest_available;
2733
2734 // Only return a value if target would change available above critical
2735 // by more than 1%, or we are within 1 MB of critical in host or guest.
2736 if guest_above_critical < ONE_MB
2737 || host_above_critical < ONE_MB
2738 || (balloon_delta.abs() * 100) / guest_above_critical > 1
2739 || (balloon_delta.abs() * 100) / host_above_critical > 1
2740 {
2741 // Finally, make sure the balloon delta won't cause a negative size.
2742 let result = max(balloon_delta_capped, -balloon_actual);
2743 if result != 0 {
2744 info!(
2745 "balloon delta={:<6} ha={:<6} hc={:<6} ga={:<6} gc={:<6} bias={:<6} full={:>3}%",
2746 result / ONE_MB,
2747 host_available / ONE_MB,
2748 self.critical_host_available / ONE_MB,
2749 guest_available / ONE_MB,
2750 self.critical_guest_available / ONE_MB,
2751 bias / ONE_MB,
2752 balloon_full_percent,
2753 );
2754 }
2755 return Ok(result);
2756 }
2757 Ok(0)
2758 }
2759 }
2760
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static, I: IrqChipArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu, I>, control_server_socket: Option<UnlinkUnixSeqpacketListener>, mut control_tubes: Vec<TaggedControlTube>, balloon_host_tube: Tube, disk_host_tubes: &[Tube], usb_control_tube: Tube, sigchld_fd: SignalFd, sandbox: bool, map_request: Arc<Mutex<Option<ExternalMapping>>>, balloon_bias: i64, mut gralloc: RutabagaGralloc, ) -> Result<()>2761 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static, I: IrqChipArch + 'static>(
2762 mut linux: RunnableLinuxVm<V, Vcpu, I>,
2763 control_server_socket: Option<UnlinkUnixSeqpacketListener>,
2764 mut control_tubes: Vec<TaggedControlTube>,
2765 balloon_host_tube: Tube,
2766 disk_host_tubes: &[Tube],
2767 usb_control_tube: Tube,
2768 sigchld_fd: SignalFd,
2769 sandbox: bool,
2770 map_request: Arc<Mutex<Option<ExternalMapping>>>,
2771 balloon_bias: i64,
2772 mut gralloc: RutabagaGralloc,
2773 ) -> Result<()> {
2774 #[derive(PollToken)]
2775 enum Token {
2776 Exit,
2777 Suspend,
2778 ChildSignal,
2779 IrqFd { index: IrqEventIndex },
2780 BalanceMemory,
2781 BalloonResult,
2782 VmControlServer,
2783 VmControl { index: usize },
2784 }
2785
2786 stdin()
2787 .set_raw_mode()
2788 .expect("failed to set terminal raw mode");
2789
2790 let wait_ctx = WaitContext::build_with(&[
2791 (&linux.exit_evt, Token::Exit),
2792 (&linux.suspend_evt, Token::Suspend),
2793 (&sigchld_fd, Token::ChildSignal),
2794 ])
2795 .map_err(Error::WaitContextAdd)?;
2796
2797 if let Some(socket_server) = &control_server_socket {
2798 wait_ctx
2799 .add(socket_server, Token::VmControlServer)
2800 .map_err(Error::WaitContextAdd)?;
2801 }
2802 for (index, socket) in control_tubes.iter().enumerate() {
2803 wait_ctx
2804 .add(socket.as_ref(), Token::VmControl { index })
2805 .map_err(Error::WaitContextAdd)?;
2806 }
2807
2808 let events = linux
2809 .irq_chip
2810 .irq_event_tokens()
2811 .map_err(Error::WaitContextAdd)?;
2812
2813 for (index, _gsi, evt) in events {
2814 wait_ctx
2815 .add(&evt, Token::IrqFd { index })
2816 .map_err(Error::WaitContextAdd)?;
2817 }
2818
2819 // Balance available memory between guest and host every second.
2820 let mut balancemem_timer = Timer::new().map_err(Error::CreateTimer)?;
2821 let mut balloon_policy = if let Ok(critical_margin) = file_to_i64(LOWMEM_MARGIN, 0) {
2822 // Create timer request balloon stats every 1s.
2823 wait_ctx
2824 .add(&balancemem_timer, Token::BalanceMemory)
2825 .map_err(Error::WaitContextAdd)?;
2826 let balancemem_dur = Duration::from_secs(1);
2827 let balancemem_int = Duration::from_secs(1);
2828 balancemem_timer
2829 .reset(balancemem_dur, Some(balancemem_int))
2830 .map_err(Error::ResetTimer)?;
2831
2832 // Listen for balloon statistics from the guest so we can balance.
2833 wait_ctx
2834 .add(&balloon_host_tube, Token::BalloonResult)
2835 .map_err(Error::WaitContextAdd)?;
2836 Some(BalloonPolicy::new(
2837 linux.vm.get_memory().memory_size() as i64,
2838 critical_margin * ONE_MB,
2839 balloon_bias,
2840 ))
2841 } else {
2842 warn!("Unable to open low mem margin, maybe not a chrome os kernel");
2843 None
2844 };
2845
2846 if sandbox {
2847 // Before starting VCPUs, in case we started with some capabilities, drop them all.
2848 drop_capabilities().map_err(Error::DropCapabilities)?;
2849 }
2850
2851 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2852 // Create a channel for GDB thread.
2853 let (to_gdb_channel, from_vcpu_channel) = if linux.gdb.is_some() {
2854 let (s, r) = mpsc::channel();
2855 (Some(s), Some(r))
2856 } else {
2857 (None, None)
2858 };
2859
2860 let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
2861 let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
2862 let use_hypervisor_signals = !linux
2863 .vm
2864 .get_hypervisor()
2865 .check_capability(&HypervisorCap::ImmediateExit);
2866 setup_vcpu_signal_handler::<Vcpu>(use_hypervisor_signals)?;
2867
2868 let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
2869 Some(vec) => vec.into_iter().map(Some).collect(),
2870 None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
2871 };
2872 for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
2873 let (to_vcpu_channel, from_main_channel) = mpsc::channel();
2874 let vcpu_affinity = match linux.vcpu_affinity.clone() {
2875 Some(VcpuAffinity::Global(v)) => v,
2876 Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
2877 None => Default::default(),
2878 };
2879 let handle = run_vcpu(
2880 cpu_id,
2881 vcpu,
2882 linux.vm.try_clone().map_err(Error::CloneEvent)?,
2883 linux.irq_chip.try_clone().map_err(Error::CloneEvent)?,
2884 linux.vcpu_count,
2885 linux.rt_cpus.contains(&cpu_id),
2886 vcpu_affinity,
2887 linux.no_smt,
2888 vcpu_thread_barrier.clone(),
2889 linux.has_bios,
2890 linux.io_bus.clone(),
2891 linux.mmio_bus.clone(),
2892 linux.exit_evt.try_clone().map_err(Error::CloneEvent)?,
2893 linux.vm.check_capability(VmCap::PvClockSuspend),
2894 from_main_channel,
2895 use_hypervisor_signals,
2896 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2897 to_gdb_channel.clone(),
2898 )?;
2899 vcpu_handles.push((handle, to_vcpu_channel));
2900 }
2901
2902 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2903 // Spawn GDB thread.
2904 if let Some((gdb_port_num, gdb_control_tube)) = linux.gdb.take() {
2905 let to_vcpu_channels = vcpu_handles
2906 .iter()
2907 .map(|(_handle, channel)| channel.clone())
2908 .collect();
2909 let target = GdbStub::new(
2910 gdb_control_tube,
2911 to_vcpu_channels,
2912 from_vcpu_channel.unwrap(), // Must succeed to unwrap()
2913 );
2914 thread::Builder::new()
2915 .name("gdb".to_owned())
2916 .spawn(move || gdb_thread(target, gdb_port_num))
2917 .map_err(Error::SpawnGdbServer)?;
2918 };
2919
2920 vcpu_thread_barrier.wait();
2921
2922 'wait: loop {
2923 let events = {
2924 match wait_ctx.wait() {
2925 Ok(v) => v,
2926 Err(e) => {
2927 error!("failed to poll: {}", e);
2928 break;
2929 }
2930 }
2931 };
2932
2933 if let Err(e) = linux.irq_chip.process_delayed_irq_events() {
2934 warn!("can't deliver delayed irqs: {}", e);
2935 }
2936
2937 let mut vm_control_indices_to_remove = Vec::new();
2938 for event in events.iter().filter(|e| e.is_readable) {
2939 match event.token {
2940 Token::Exit => {
2941 info!("vcpu requested shutdown");
2942 break 'wait;
2943 }
2944 Token::Suspend => {
2945 info!("VM requested suspend");
2946 linux.suspend_evt.read().unwrap();
2947 kick_all_vcpus(&vcpu_handles, &linux.irq_chip, &VmRunMode::Suspending);
2948 }
2949 Token::ChildSignal => {
2950 // Print all available siginfo structs, then exit the loop.
2951 while let Some(siginfo) = sigchld_fd.read().map_err(Error::SignalFd)? {
2952 let pid = siginfo.ssi_pid;
2953 let pid_label = match linux.pid_debug_label_map.get(&pid) {
2954 Some(label) => format!("{} (pid {})", label, pid),
2955 None => format!("pid {}", pid),
2956 };
2957 error!(
2958 "child {} died: signo {}, status {}, code {}",
2959 pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
2960 );
2961 }
2962 break 'wait;
2963 }
2964 Token::IrqFd { index } => {
2965 if let Err(e) = linux.irq_chip.service_irq_event(index) {
2966 error!("failed to signal irq {}: {}", index, e);
2967 }
2968 }
2969 Token::BalanceMemory => {
2970 balancemem_timer.wait().map_err(Error::Timer)?;
2971 let command = BalloonControlCommand::Stats {};
2972 if let Err(e) = balloon_host_tube.send(&command) {
2973 warn!("failed to send stats request to balloon device: {}", e);
2974 }
2975 }
2976 Token::BalloonResult => {
2977 match balloon_host_tube.recv() {
2978 Ok(BalloonControlResult::Stats {
2979 stats,
2980 balloon_actual: balloon_actual_u,
2981 }) => {
2982 match balloon_policy
2983 .as_mut()
2984 .map(|p| p.delta(stats, balloon_actual_u))
2985 {
2986 None => {
2987 error!(
2988 "got result from balloon stats, but no policy is running"
2989 );
2990 }
2991 Some(Err(e)) => {
2992 warn!("failed to run balloon policy {}", e);
2993 }
2994 Some(Ok(delta)) if delta != 0 => {
2995 let target = max((balloon_actual_u as i64) + delta, 0) as u64;
2996 let command =
2997 BalloonControlCommand::Adjust { num_bytes: target };
2998 if let Err(e) = balloon_host_tube.send(&command) {
2999 warn!(
3000 "failed to send memory value to balloon device: {}",
3001 e
3002 );
3003 }
3004 }
3005 Some(Ok(_)) => {}
3006 }
3007 }
3008 Err(e) => {
3009 error!("failed to recv BalloonControlResult: {}", e);
3010 }
3011 };
3012 }
3013 Token::VmControlServer => {
3014 if let Some(socket_server) = &control_server_socket {
3015 match socket_server.accept() {
3016 Ok(socket) => {
3017 wait_ctx
3018 .add(
3019 &socket,
3020 Token::VmControl {
3021 index: control_tubes.len(),
3022 },
3023 )
3024 .map_err(Error::WaitContextAdd)?;
3025 control_tubes.push(TaggedControlTube::Vm(Tube::new(socket)));
3026 }
3027 Err(e) => error!("failed to accept socket: {}", e),
3028 }
3029 }
3030 }
3031 Token::VmControl { index } => {
3032 if let Some(socket) = control_tubes.get(index) {
3033 match socket {
3034 TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3035 Ok(request) => {
3036 let mut run_mode_opt = None;
3037 let response = request.execute(
3038 &mut run_mode_opt,
3039 &balloon_host_tube,
3040 disk_host_tubes,
3041 &usb_control_tube,
3042 &mut linux.bat_control,
3043 );
3044 if let Err(e) = tube.send(&response) {
3045 error!("failed to send VmResponse: {}", e);
3046 }
3047 if let Some(run_mode) = run_mode_opt {
3048 info!("control socket changed run mode to {}", run_mode);
3049 match run_mode {
3050 VmRunMode::Exiting => {
3051 break 'wait;
3052 }
3053 other => {
3054 if other == VmRunMode::Running {
3055 linux.io_bus.notify_resume();
3056 }
3057 kick_all_vcpus(
3058 &vcpu_handles,
3059 &linux.irq_chip,
3060 &other,
3061 );
3062 }
3063 }
3064 }
3065 }
3066 Err(e) => {
3067 if let TubeError::Disconnected = e {
3068 vm_control_indices_to_remove.push(index);
3069 } else {
3070 error!("failed to recv VmRequest: {}", e);
3071 }
3072 }
3073 },
3074 TaggedControlTube::VmMemory(tube) => {
3075 match tube.recv::<VmMemoryRequest>() {
3076 Ok(request) => {
3077 let response = request.execute(
3078 &mut linux.vm,
3079 &mut linux.resources,
3080 Arc::clone(&map_request),
3081 &mut gralloc,
3082 );
3083 if let Err(e) = tube.send(&response) {
3084 error!("failed to send VmMemoryControlResponse: {}", e);
3085 }
3086 }
3087 Err(e) => {
3088 if let TubeError::Disconnected = e {
3089 vm_control_indices_to_remove.push(index);
3090 } else {
3091 error!("failed to recv VmMemoryControlRequest: {}", e);
3092 }
3093 }
3094 }
3095 }
3096 TaggedControlTube::VmIrq(tube) => match tube.recv::<VmIrqRequest>() {
3097 Ok(request) => {
3098 let response = {
3099 let irq_chip = &mut linux.irq_chip;
3100 request.execute(
3101 |setup| match setup {
3102 IrqSetup::Event(irq, ev) => {
3103 if let Some(event_index) = irq_chip
3104 .register_irq_event(irq, ev, None)?
3105 {
3106 match wait_ctx.add(
3107 ev,
3108 Token::IrqFd {
3109 index: event_index
3110 },
3111 ) {
3112 Err(e) => {
3113 warn!("failed to add IrqFd to poll context: {}", e);
3114 Err(e)
3115 },
3116 Ok(_) => {
3117 Ok(())
3118 }
3119 }
3120 } else {
3121 Ok(())
3122 }
3123 }
3124 IrqSetup::Route(route) => irq_chip.route_irq(route),
3125 },
3126 &mut linux.resources,
3127 )
3128 };
3129 if let Err(e) = tube.send(&response) {
3130 error!("failed to send VmIrqResponse: {}", e);
3131 }
3132 }
3133 Err(e) => {
3134 if let TubeError::Disconnected = e {
3135 vm_control_indices_to_remove.push(index);
3136 } else {
3137 error!("failed to recv VmIrqRequest: {}", e);
3138 }
3139 }
3140 },
3141 TaggedControlTube::VmMsync(tube) => {
3142 match tube.recv::<VmMsyncRequest>() {
3143 Ok(request) => {
3144 let response = request.execute(&mut linux.vm);
3145 if let Err(e) = tube.send(&response) {
3146 error!("failed to send VmMsyncResponse: {}", e);
3147 }
3148 }
3149 Err(e) => {
3150 if let TubeError::Disconnected = e {
3151 vm_control_indices_to_remove.push(index);
3152 } else {
3153 error!("failed to recv VmMsyncRequest: {}", e);
3154 }
3155 }
3156 }
3157 }
3158 TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3159 Ok(request) => {
3160 let response =
3161 request.execute(&mut linux.vm, &mut linux.resources);
3162 if let Err(e) = tube.send(&response) {
3163 error!("failed to send VmResponse: {}", e);
3164 }
3165 }
3166 Err(e) => {
3167 if let TubeError::Disconnected = e {
3168 vm_control_indices_to_remove.push(index);
3169 } else {
3170 error!("failed to recv VmResponse: {}", e);
3171 }
3172 }
3173 },
3174 }
3175 }
3176 }
3177 }
3178 }
3179
3180 for event in events.iter().filter(|e| e.is_hungup) {
3181 match event.token {
3182 Token::Exit => {}
3183 Token::Suspend => {}
3184 Token::ChildSignal => {}
3185 Token::IrqFd { index: _ } => {}
3186 Token::BalanceMemory => {}
3187 Token::BalloonResult => {}
3188 Token::VmControlServer => {}
3189 Token::VmControl { index } => {
3190 // It's possible more data is readable and buffered while the socket is hungup,
3191 // so don't delete the tube from the poll context until we're sure all the
3192 // data is read.
3193 if control_tubes
3194 .get(index)
3195 .map(|s| !s.as_ref().is_packet_ready())
3196 .unwrap_or(false)
3197 {
3198 vm_control_indices_to_remove.push(index);
3199 }
3200 }
3201 }
3202 }
3203
3204 // Sort in reverse so the highest indexes are removed first. This removal algorithm
3205 // preserves correct indexes as each element is removed.
3206 vm_control_indices_to_remove.sort_unstable_by_key(|&k| Reverse(k));
3207 vm_control_indices_to_remove.dedup();
3208 for index in vm_control_indices_to_remove {
3209 // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
3210 // this automatically when the FD inserted into the `wait_ctx` is closed after this
3211 // if-block, but this removal can be deferred unpredictably. In some instances where the
3212 // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
3213 // that has already been closed. Because the token associated with that spurious event
3214 // now belongs to a different socket, the control loop will start to interact with
3215 // sockets that might not be ready to use. This can cause incorrect hangup detection or
3216 // blocking on a socket that will never be ready. See also: crbug.com/1019986
3217 if let Some(socket) = control_tubes.get(index) {
3218 wait_ctx.delete(socket).map_err(Error::WaitContextDelete)?;
3219 }
3220
3221 // This line implicitly drops the socket at `index` when it gets returned by
3222 // `swap_remove`. After this line, the socket at `index` is not the one from
3223 // `vm_control_indices_to_remove`. Because of this socket's change in index, we need to
3224 // use `wait_ctx.modify` to change the associated index in its `Token::VmControl`.
3225 control_tubes.swap_remove(index);
3226 if let Some(tube) = control_tubes.get(index) {
3227 wait_ctx
3228 .modify(tube, EventType::Read, Token::VmControl { index })
3229 .map_err(Error::WaitContextAdd)?;
3230 }
3231 }
3232 }
3233
3234 kick_all_vcpus(&vcpu_handles, &linux.irq_chip, &VmRunMode::Exiting);
3235 for (handle, _) in vcpu_handles {
3236 if let Err(e) = handle.join() {
3237 error!("failed to join vcpu thread: {:?}", e);
3238 }
3239 }
3240
3241 // Explicitly drop the VM structure here to allow the devices to clean up before the
3242 // control sockets are closed when this function exits.
3243 mem::drop(linux);
3244
3245 stdin()
3246 .set_canon_mode()
3247 .expect("failed to restore canonical mode for terminal");
3248
3249 Ok(())
3250 }
3251