// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

mod process;
mod vcpu;

use std::fs::File;
use std::io;
use std::io::Read;
use std::os::unix::net::UnixDatagram;
use std::path::Path;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier};
use std::thread;
use std::time::{Duration, Instant};

use libc::{
    c_int, c_ulong, fcntl, ioctl, socketpair, AF_UNIX, EAGAIN, EBADF, EDEADLK, EEXIST, EINTR,
    EINVAL, ENOENT, EOVERFLOW, EPERM, FIOCLEX, F_SETPIPE_SZ, MS_NODEV, MS_NOEXEC, MS_NOSUID,
    MS_RDONLY, O_NONBLOCK, SIGCHLD, SOCK_SEQPACKET,
};

use anyhow::{anyhow, bail, Context, Result};
use protobuf::ProtobufError;
use remain::sorted;
use thiserror::Error;

use base::{
    add_fd_flags, block_signal, clear_signal, drop_capabilities, enable_core_scheduling, error,
    getegid, geteuid, info, pipe, register_rt_signal_handler, validate_raw_descriptor, warn,
    AsRawDescriptor, Descriptor, Error as SysError, Event, FromRawDescriptor, Killable, MmapError,
    PollToken, RawDescriptor, Result as SysResult, SignalFd, WaitContext, SIGRTMIN,
};
use kvm::{Cap, Datamatch, IoeventAddress, Kvm, Vcpu, VcpuExit, Vm};
use minijail::{self, Minijail};
use net_util::{Tap, TapT};
use vm_memory::{GuestMemory, MemoryPolicy};

use self::process::*;
use self::vcpu::*;
use crate::{Config, Executable};

const MAX_DATAGRAM_SIZE: usize = 4096;
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
const CROSVM_GPU_SERVER_FD_ENV: &str = "CROSVM_GPU_SERVER_FD";

/// An error that occurs when communicating with the plugin process.
#[sorted]
#[derive(Error, Debug)]
pub enum CommError {
    #[error("failed to decode plugin request: {0}")]
    DecodeRequest(ProtobufError),
    #[error("failed to encode plugin response: {0}")]
    EncodeResponse(ProtobufError),
    #[error("plugin request socket has been hung up")]
    PluginSocketHup,
    #[error("failed to recv from plugin request socket: {0}")]
    PluginSocketRecv(SysError),
    #[error("failed to send to plugin request socket: {0}")]
    PluginSocketSend(SysError),
}

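/// Creates a pair of connected `SOCK_SEQPACKET` sockets, marking the first end close-on-exec so
/// it is not leaked to child processes.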
fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
    let mut fds = [0, 0];
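    // Safe because we give `socketpair` enough space for the resulting pair of descriptors and
    // check its return value before using them.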
    unsafe {
        let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
        if ret == 0 {
            ioctl(fds[0], FIOCLEX);
            Ok((
                UnixDatagram::from_raw_descriptor(fds[0]),
                UnixDatagram::from_raw_descriptor(fds[1]),
            ))
        } else {
            Err(SysError::last())
        }
    }
}

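/// The two pipe pairs used for bidirectional vCPU communication: crosvm reads from `crosvm_read`
/// what the plugin writes to `plugin_write`, and the plugin reads from `plugin_read` what crosvm
/// writes to `crosvm_write`.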
struct VcpuPipe {
    crosvm_read: File,
    plugin_write: File,
    plugin_read: File,
    crosvm_write: File,
}

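/// Creates both pipe pairs of a `VcpuPipe` and attempts to grow each pipe's buffer to
/// `MAX_VCPU_DATAGRAM_SIZE` so that vCPU messages can be written atomically.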
fn new_pipe_pair() -> SysResult<VcpuPipe> {
    let to_crosvm = pipe(true)?;
    let to_plugin = pipe(true)?;
    // Increasing the pipe size is a nice-to-have that helps ensure messages get across
    // atomically (and that writes don't block), though it's not necessarily a hard requirement
    // for things to work.
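    // Safe because fcntl is given a valid pipe descriptor and we check the result.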
    let flags = unsafe {
        fcntl(
            to_crosvm.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of crosvm pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
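    // Safe for the same reason as the fcntl call above.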
    let flags = unsafe {
        fcntl(
            to_plugin.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of plugin pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
    Ok(VcpuPipe {
        crosvm_read: to_crosvm.0,
        plugin_write: to_crosvm.1,
        plugin_read: to_plugin.0,
        crosvm_write: to_plugin.1,
    })
}

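/// Maps a protobuf error to a `SysError`, preserving the OS error code when one is available.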
fn proto_to_sys_err(e: ProtobufError) -> SysError {
    match e {
        ProtobufError::IoError(e) => SysError::new(e.raw_os_error().unwrap_or(EINVAL)),
        _ => SysError::new(EINVAL),
    }
}

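/// Maps an `io::Error` to a `SysError`, defaulting to `EINVAL` when no OS error code is present.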
fn io_to_sys_err(e: io::Error) -> SysError {
    SysError::new(e.raw_os_error().unwrap_or(EINVAL))
}

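/// Maps an `MmapError` to a `SysError`, passing through the underlying system call error.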
fn mmap_to_sys_err(e: MmapError) -> SysError {
    match e {
        MmapError::SystemCallFailed(e) => e,
        _ => SysError::new(EINVAL),
    }
}

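/// Builds the minijail sandbox that the plugin process will run in, pivoting into `root` and
/// applying the seccomp policy found at `seccomp_policy`.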
fn create_plugin_jail(root: &Path, log_failures: bool, seccomp_policy: &Path) -> Result<Minijail> {
    // All child jails run in a new user namespace without any users mapped; they run as nobody
    // unless otherwise configured.
    let mut j = Minijail::new().context("failed to create jail")?;
    j.namespace_pids();
    j.namespace_user();
    j.uidmap(&format!("0 {0} 1", geteuid()))
        .context("failed to set uidmap for jail")?;
    j.gidmap(&format!("0 {0} 1", getegid()))
        .context("failed to set gidmap for jail")?;
    j.namespace_user_disable_setgroups();
    // Don't need any capabilities.
    j.use_caps(0);
    // Create a new mount namespace with an empty root FS.
    j.namespace_vfs();
    j.enter_pivot_root(root)
        .context("failed to set jail pivot root")?;
    // Run in an empty network namespace.
    j.namespace_net();
    j.no_new_privs();
    // By default we'll prioritize using the pre-compiled .bpf over the .policy
    // file (the .bpf is expected to be compiled using "trap" as the failure
    // behavior instead of the default "kill" behavior).
    // Refer to the code comment for the "seccomp-log-failures"
    // command-line parameter for an explanation about why the |log_failures|
    // flag forces the use of .policy files (and the build-time alternative to
    // this run-time flag).
    let bpf_policy_file = seccomp_policy.with_extension("bpf");
    if bpf_policy_file.exists() && !log_failures {
        j.parse_seccomp_program(&bpf_policy_file)
            .context("failed to parse jail seccomp BPF program")?;
    } else {
        // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
        // which will correctly kill the entire device process if a worker
        // thread commits a seccomp violation.
        j.set_seccomp_filter_tsync();
        if log_failures {
            j.log_seccomp_filter_failures();
        }
        j.parse_seccomp_filters(&seccomp_policy.with_extension("policy"))
            .context("failed to parse jail seccomp filter")?;
    }
    j.use_seccomp_filter();
    // Don't do init setup.
    j.run_as_init();

    // Create a tmpfs in the plugin's root directory so that we can bind mount its executable
    // file into it. The size=67108864 is size=64*1024*1024 or size=64MB.
    j.mount_with_data(
        Path::new("none"),
        Path::new("/"),
        "tmpfs",
        (MS_NOSUID | MS_NODEV | MS_NOEXEC) as usize,
        "size=67108864",
    )
    .context("failed to mount root")?;

    // Because we requested to "run as init", minijail will not mount /proc for us even though
    // the plugin will be running in its own PID namespace, so we have to mount it ourselves.
    j.mount(
        Path::new("proc"),
        Path::new("/proc"),
        "proc",
        (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RDONLY) as usize,
    )
    .context("failed to mount proc")?;

    Ok(j)
}

/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be done by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///     Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///     _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    IoEvent {
        evt: Event,
        addr: IoeventAddress,
        length: u32,
        datamatch: u64,
    },
    Memory {
        slot: u32,
        length: usize,
    },
    IrqEvent {
        irq_id: u32,
        evt: Event,
    },
}

impl PluginObject {
    fn destroy(self, vm: &mut Vm) -> SysResult<()> {
        match self {
            PluginObject::IoEvent {
                evt,
                addr,
                length,
                datamatch,
            } => match length {
                0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
                1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
                2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
                4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
                8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch as u64))),
                _ => Err(SysError::new(EINVAL)),
            },
            PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())),
            PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
        }
    }
}

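/// Spawns one thread per vCPU to run the KVM run loop and forward VM exits to the plugin process.
/// The spawned join handles are appended to `vcpu_handles` so the caller can stop and join them
/// during cleanup.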
pub fn run_vcpus(
    kvm: &Kvm,
    vm: &Vm,
    plugin: &Process,
    vcpu_count: u32,
    kill_signaled: &Arc<AtomicBool>,
    exit_evt: &Event,
    vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
) -> Result<()> {
    let vcpu_thread_barrier = Arc::new(Barrier::new(vcpu_count as usize));
    let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);

    // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
    // to that vcpu's thread. If KVM is running the VM then it'll return -EINTR.
    // An issue is what to do when KVM isn't running the VM (where we could be
    // in the kernel or in the app).
    //
    // If KVM supports "immediate exit" then we set a signal handler that will
    // set the |immediate_exit| flag that tells KVM to return -EINTR before running
    // the VM.
    //
    // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
    // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
    // signal might get asserted). There's overhead to have KVM unblock and re-block
    // SIGRTMIN each time it runs the VM, so this mode should be avoided.

    if use_kvm_signals {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {}
            // Our signal handler does nothing and is trivially async signal safe.
            // We need to install this signal handler even though we do block
            // the signal below, to ensure that this signal will interrupt
            // execution of KVM_RUN (this is an implementation issue).
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
        // We do not really want the signal handler to run...
        block_signal(SIGRTMIN() + 0).expect("failed to block signal");
    } else {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {
                Vcpu::set_local_immediate_exit(true);
            }
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
    }

    for cpu_id in 0..vcpu_count {
        let kill_signaled = kill_signaled.clone();
        let vcpu_thread_barrier = vcpu_thread_barrier.clone();
        let vcpu_exit_evt = exit_evt.try_clone().context("failed to clone event")?;
        let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
        let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).context("error creating vcpu")?;

        vcpu_handles.push(
            thread::Builder::new()
                .name(format!("crosvm_vcpu{}", cpu_id))
                .spawn(move || {
                    if use_kvm_signals {
                        // Tell KVM to not block anything when entering kvm run
                        // because we will be using the first RT signal to kick the VCPU.
                        vcpu.set_signal_mask(&[])
                            .expect("failed to set up KVM VCPU signal mask");
                    }
339
340 if let Err(e) = enable_core_scheduling() {
341 error!("Failed to enable core scheduling: {}", e);
342 }
343
344 let vcpu = vcpu
345 .to_runnable(Some(SIGRTMIN() + 0))
346 .expect("Failed to set thread id");
347
348 let res = vcpu_plugin.init(&vcpu);
349 vcpu_thread_barrier.wait();
350 if let Err(e) = res {
351 error!("failed to initialize vcpu {}: {}", cpu_id, e);
352 } else {
353 loop {
354 let mut interrupted_by_signal = false;
355 let run_res = vcpu.run();
356 match run_res {
357 Ok(run) => match run {
358 VcpuExit::IoIn { port, mut size } => {
359 let mut data = [0; 256];
360 if size > data.len() {
361 error!(
362 "unsupported IoIn size of {} bytes at port {:#x}",
363 size, port
364 );
365 size = data.len();
366 }
367 vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
368 if let Err(e) = vcpu.set_data(&data[..size]) {
369 error!(
370 "failed to set return data for IoIn at port {:#x}: {}",
371 port, e
372 );
373 }
374 }
375 VcpuExit::IoOut {
376 port,
377 mut size,
378 data,
379 } => {
380 if size > data.len() {
381 error!("unsupported IoOut size of {} bytes at port {:#x}", size, port);
382 size = data.len();
383 }
384 vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
385 }
386 VcpuExit::MmioRead { address, size } => {
387 let mut data = [0; 8];
388 vcpu_plugin.mmio_read(
389 address as u64,
390 &mut data[..size],
391 &vcpu,
392 );
393 // Setting data for mmio can not fail.
394 let _ = vcpu.set_data(&data[..size]);
395 }
396 VcpuExit::MmioWrite {
397 address,
398 size,
399 data,
400 } => {
401 vcpu_plugin.mmio_write(
402 address as u64,
403 &data[..size],
404 &vcpu,
405 );
406 }
407 VcpuExit::HypervHcall { input, params } => {
408 let mut data = [0; 8];
409 vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
410 // Setting data for hyperv call can not fail.
411 let _ = vcpu.set_data(&data);
412 }
413 VcpuExit::HypervSynic {
414 msr,
415 control,
416 evt_page,
417 msg_page,
418 } => {
419 vcpu_plugin
420 .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
421 }
422 VcpuExit::Hlt => break,
423 VcpuExit::Shutdown => break,
424 VcpuExit::InternalError => {
425 error!("vcpu {} has internal error", cpu_id);
426 break;
427 }
428 r => warn!("unexpected vcpu exit: {:?}", r),
429 },
430 Err(e) => match e.errno() {
431 EINTR => interrupted_by_signal = true,
432 EAGAIN => {}
433 _ => {
434 error!("vcpu hit unknown error: {}", e);
435 break;
436 }
437 },
438 }
439 if kill_signaled.load(Ordering::SeqCst) {
440 break;
441 }
442
                            // Only handle the pause request if KVM reported that it was
                            // interrupted by a signal. This helps to ensure that KVM has had a
                            // chance to finish emulating any IO that may have immediately
                            // happened. If we eagerly check pre_run() then any IO that we just
                            // reported to the plugin won't have been processed yet by KVM.
                            // Not eagerly calling pre_run() also helps to reduce any overhead
                            // from checking if a pause request is pending. The assumption is
                            // that pause requests aren't common or frequent, so it's better to
                            // optimize for the non-pause execution paths.
                            if interrupted_by_signal {
                                if use_kvm_signals {
                                    clear_signal(SIGRTMIN() + 0)
                                        .expect("failed to clear pending signal");
                                } else {
                                    vcpu.set_immediate_exit(false);
                                }

                                if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
                                    error!("failed to process pause on vcpu {}: {}", cpu_id, e);
                                    break;
                                }
                            }
                        }
                    }
                    vcpu_exit_evt
                        .write(1)
                        .expect("failed to signal vcpu exit event");
                })
                .context("error spawning vcpu thread")?,
        );
    }
    Ok(())
}

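/// Tokens identifying the event sources that the main `run_config` wait loop polls on.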
#[derive(PollToken)]
enum Token {
    Exit,
    ChildSignal,
    Stderr,
    Plugin { index: usize },
}

/// Run a VM with a plugin process specified by `cfg`.
///
/// Not every field of `cfg` will be used. In particular, most fields that pertain to a specific
/// device are ignored because the plugin is responsible for emulating hardware.
pub fn run_config(cfg: Config) -> Result<()> {
    info!("crosvm starting plugin process");

    // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
    // before any jailed devices have been spawned, so that we can catch any of them that fail very
    // quickly.
    let sigchld_fd = SignalFd::new(SIGCHLD).context("failed to create signalfd")?;

    // Create a pipe to capture error messages from plugin and minijail.
    let (mut stderr_rd, stderr_wr) = pipe(true).context("failed to create stderr pipe")?;
    add_fd_flags(stderr_rd.as_raw_descriptor(), O_NONBLOCK)
        .context("error marking stderr nonblocking")?;

    #[allow(unused_mut)]
    let mut env_fds: Vec<(String, Descriptor)> = Vec::default();

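    // Fallback parameters for the GPU render server, used below when the
    // "plugin-render-server" feature is enabled but no parameters were supplied in `cfg`.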
    let _default_render_server_params = crate::platform::GpuRenderServerParameters {
        path: std::path::PathBuf::from("/usr/libexec/virgl_render_server"),
        cache_path: None,
        cache_size: None,
    };

    #[cfg(feature = "gpu")]
    let gpu_render_server_parameters = if let Some(parameters) = &cfg.gpu_render_server_parameters {
        Some(parameters)
    } else if cfg!(feature = "plugin-render-server") {
        Some(&_default_render_server_params)
    } else {
        None
    };

    #[cfg(feature = "gpu")]
    // Hold on to the render server jail so it keeps running until we exit run_config().
    let (_render_server_jail, _render_server_fd) =
        if let Some(parameters) = &gpu_render_server_parameters {
            let (jail, fd) = crate::platform::gpu::start_gpu_render_server(&cfg, parameters)?;
            env_fds.push((
                CROSVM_GPU_SERVER_FD_ENV.to_string(),
                Descriptor(fd.as_raw_descriptor()),
            ));
            (
                Some(crate::platform::jail_helpers::ScopedMinijail(jail)),
                Some(fd),
            )
        } else {
            (None, None)
        };

    let jail = if let Some(jail_config) = &cfg.jail_config {
        // An empty directory for jailed plugin pivot root.
        let root_path = match &cfg.plugin_root {
            Some(dir) => dir,
            None => Path::new(option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty")),
        };

        if root_path.is_relative() {
            bail!("path to the root directory must be absolute");
        }

        if !root_path.exists() {
            bail!("no root directory for jailed process to pivot root into");
        }

        if !root_path.is_dir() {
            bail!("specified root directory is not a directory");
        }

        let policy_path = jail_config.seccomp_policy_dir.join("plugin");
        let mut jail =
            create_plugin_jail(root_path, jail_config.seccomp_log_failures, &policy_path)?;

        // Update gid map of the jail if caller provided supplemental groups.
        if !cfg.plugin_gid_maps.is_empty() {
            let map = format!("0 {} 1", getegid())
                + &cfg
                    .plugin_gid_maps
                    .into_iter()
                    .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count))
                    .collect::<String>();
            jail.gidmap(&map).context("failed to set gidmap for jail")?;
        }

        // Mount minimal set of devices (full, zero, urandom, etc). We can not use
        // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
        let device_names = ["full", "null", "urandom", "zero"];
        for name in &device_names {
            let device = Path::new("/dev").join(&name);
            jail.mount_bind(&device, &device, true)
                .context("failed to mount dev")?;
        }

        for bind_mount in &cfg.plugin_mounts {
            jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
                .with_context(|| {
                    format!(
                        "failed to bind mount {} -> {} as {} ",
                        bind_mount.src.display(),
                        bind_mount.dst.display(),
                        if bind_mount.writable {
                            "writable"
                        } else {
                            "read only"
                        }
                    )
                })?;
        }

        Some(jail)
    } else {
        None
    };

    let mut tap_interfaces: Vec<Tap> = Vec::new();
    if let Some(host_ip) = cfg.host_ip {
        if let Some(netmask) = cfg.netmask {
            if let Some(mac_address) = cfg.mac_address {
                let tap = Tap::new(false, false).context("error opening tap device")?;
                tap.set_ip_addr(host_ip).context("error setting tap ip")?;
                tap.set_netmask(netmask)
                    .context("error setting tap netmask")?;
                tap.set_mac_address(mac_address)
                    .context("error setting tap mac address")?;

                tap.enable().context("error enabling tap device")?;
                tap_interfaces.push(tap);
            }
        }
    }
    for tap_fd in cfg.tap_fd {
        // Safe because we ensure that we get a unique handle to the fd.
        let tap = unsafe {
            Tap::from_raw_descriptor(
                validate_raw_descriptor(tap_fd).context("failed to validate raw tap fd")?,
            )
            .context("failed to create tap device from raw fd")?
        };
        tap_interfaces.push(tap);
    }

    let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();

    let plugin_path = match cfg.executable_path {
        Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
        _ => panic!("Executable was not a plugin"),
    };
    let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32;
    let mem = GuestMemory::new(&[]).unwrap();
    let mut mem_policy = MemoryPolicy::empty();
    if cfg.hugepages {
        mem_policy |= MemoryPolicy::USE_HUGEPAGES;
    }
    mem.set_memory_policy(mem_policy);
    let kvm = Kvm::new_with_path(&cfg.kvm_device_path).context("error creating Kvm")?;
    let mut vm = Vm::new(&kvm, mem).context("error creating vm")?;
    vm.create_irq_chip()
        .context("failed to create kvm irqchip")?;
    vm.create_pit().context("failed to create kvm PIT")?;

    let mut plugin = Process::new(
        vcpu_count,
        plugin_path,
        &plugin_args,
        jail,
        stderr_wr,
        env_fds,
    )?;
    // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
    // we can drop all our capabilities in case we had any.
    drop_capabilities().context("failed to drop process capabilities")?;

    let mut res = Ok(());
    // If Some, we will exit after enough time has passed to shut down cleanly.
    let mut dying_instant: Option<Instant> = None;
    let duration_to_die = Duration::from_millis(1000);

    let exit_evt = Event::new().context("failed to create event")?;
    let kill_signaled = Arc::new(AtomicBool::new(false));
    let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);

    let wait_ctx = WaitContext::build_with(&[
        (&exit_evt, Token::Exit),
        (&sigchld_fd, Token::ChildSignal),
        (&stderr_rd, Token::Stderr),
    ])
    .context("failed to add control descriptors to wait context")?;

    let mut sockets_to_drop = Vec::new();
    let mut redo_wait_ctx_sockets = true;
    // In this loop, make every attempt to not return early. If an error is encountered, set `res`
    // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
    // If the plugin cannot be signaled because it is dead or `signal_kill` failed, simply break
    // from the poll loop so that the VCPU threads can be cleaned up.
    'wait: loop {
        // After we have waited long enough, it's time to give up and exit.
        if dying_instant
            .map(|i| i.elapsed() >= duration_to_die)
            .unwrap_or(false)
        {
            break;
        }

        if redo_wait_ctx_sockets {
            for (index, socket) in plugin.sockets().iter().enumerate() {
                wait_ctx
                    .add(socket, Token::Plugin { index })
                    .context("failed to add plugin sockets to wait context")?;
            }
        }

        let plugin_socket_count = plugin.sockets().len();
        let events = {
            let poll_res = match dying_instant {
                Some(inst) => wait_ctx.wait_timeout(duration_to_die - inst.elapsed()),
                None => wait_ctx.wait(),
            };
            match poll_res {
                Ok(v) => v,
                Err(e) => {
                    // Polling no longer works; time to break and clean up.
                    if res.is_ok() {
                        res = Err(e).context("failed to poll all FDs");
                    }
                    break;
                }
            }
        };

        for event in events.iter().filter(|e| e.is_hungup) {
            if let Token::Stderr = event.token {
                let _ = wait_ctx.delete(&stderr_rd);
            }
        }

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::Exit => {
                    // No need to check the exit event if we are already doing cleanup.
                    let _ = wait_ctx.delete(&exit_evt);
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on exit event");
                    }
                }
                Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop.
                    loop {
                        match sigchld_fd.read() {
                            Ok(Some(siginfo)) => {
                                // If the plugin process has ended, there is no need to continue
                                // processing plugin connections, so we break early.
                                if siginfo.ssi_pid == plugin.pid() as u32 {
                                    break 'wait;
                                }
                                // Because SIGCHLD is not expected from anything other than the
                                // plugin process, report it as an error.
                                if res.is_ok() {
                                    res = Err(anyhow!(
                                        "process {} died with signal {}, status {}, and code {}",
                                        siginfo.ssi_pid,
                                        siginfo.ssi_signo,
                                        siginfo.ssi_status,
                                        siginfo.ssi_code,
                                    ));
                                }
                            }
                            Ok(None) => break, // No more signals to read.
                            Err(e) => {
                                // Something really must be messed up for this to happen, continue
                                // processing connections for a limited time.
                                if res.is_ok() {
                                    res = Err(e).context("failed to read signal fd");
                                }
                                break;
                            }
                        }
                    }
                    // As we only spawn the plugin process, getting a SIGCHLD can only mean
                    // something went wrong.
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on SIGCHLD");
                    }
                }
                Token::Stderr => loop {
                    let mut buf = [0u8; 4096];
                    match stderr_rd.read(&mut buf) {
                        Ok(len) => {
                            for l in String::from_utf8_lossy(&buf[0..len]).lines() {
                                error!("minijail/plugin: {}", l);
                            }
                        }
                        Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {
                            break;
                        }
                        Err(e) => {
                            error!("failed reading from stderr: {}", e);
                            break;
                        }
                    }
                },
                Token::Plugin { index } => {
                    match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
                    {
                        Ok(_) => {}
                        // A HUP is an expected event for a socket, so don't bother warning about
                        // it.
                        Err(CommError::PluginSocketHup) => sockets_to_drop.push(index),
                        // Only one connection out of potentially many is broken. Drop it, but don't
                        // start cleaning up. Because the error isn't returned, we will warn about
                        // it here.
                        Err(e) => {
                            warn!("error handling plugin socket: {}", e);
                            sockets_to_drop.push(index);
                        }
                    }
                }
            }
        }

        if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
            let res = run_vcpus(
                &kvm,
                &vm,
                &plugin,
                vcpu_count,
                &kill_signaled,
                &exit_evt,
                &mut vcpu_handles,
            );
            if let Err(e) = res {
                dying_instant.get_or_insert(Instant::now());
                error!("failed to start vcpus: {}", e);
            }
        }

        redo_wait_ctx_sockets =
            !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;

        // Cleanup all of the sockets that we have determined were disconnected or suffered some
        // other error.
        plugin.drop_sockets(&mut sockets_to_drop);
        sockets_to_drop.clear();

        if redo_wait_ctx_sockets {
            for socket in plugin.sockets() {
                let _ = wait_ctx.delete(socket);
            }
        }
    }

    // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
    kill_signaled.store(true, Ordering::SeqCst);
    // Depending on how we ended up here, the plugin process or a VCPU thread waiting for requests
    // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
    // blocked connections.
    plugin
        .signal_kill()
        .context("error sending kill signal to plugin on cleanup")?;
    for handle in vcpu_handles {
        match handle.kill(SIGRTMIN() + 0) {
            Ok(_) => {
                if let Err(e) = handle.join() {
                    error!("failed to join vcpu thread: {:?}", e);
                }
            }
            Err(e) => error!("failed to kill vcpu thread: {}", e),
        }
    }

    match plugin.try_wait() {
        // The plugin has run out of time by now.
        Ok(ProcessStatus::Running) => Err(anyhow!("plugin did not exit within timeout")),
        // Return an error discovered earlier in this function.
        Ok(ProcessStatus::Success) => res.map_err(anyhow::Error::msg),
        Ok(ProcessStatus::Fail(code)) => Err(anyhow!("plugin exited with error: {}", code)),
        Ok(ProcessStatus::Signal(code)) => Err(anyhow!("plugin exited with signal {}", code)),
        Err(e) => Err(anyhow!("error waiting for plugin to exit: {}", e)),
    }
}