// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

mod process;
mod vcpu;

use std::fs::File;
use std::io;
use std::io::Read;
use std::os::unix::net::UnixDatagram;
use std::path::Path;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier};
use std::thread;
use std::time::{Duration, Instant};

use libc::{
    c_int, c_ulong, fcntl, ioctl, socketpair, AF_UNIX, EAGAIN, EBADF, EDEADLK, EEXIST, EINTR,
    EINVAL, ENOENT, EOVERFLOW, EPERM, FIOCLEX, F_SETPIPE_SZ, MS_NODEV, MS_NOEXEC, MS_NOSUID,
    MS_RDONLY, O_NONBLOCK, SIGCHLD, SOCK_SEQPACKET,
};

use anyhow::{anyhow, bail, Context, Result};
use protobuf::ProtobufError;
use remain::sorted;
use thiserror::Error;

use base::{
    add_fd_flags, block_signal, clear_signal, drop_capabilities, enable_core_scheduling, error,
    getegid, geteuid, info, pipe, register_rt_signal_handler, validate_raw_descriptor, warn,
    AsRawDescriptor, Descriptor, Error as SysError, Event, FromRawDescriptor, Killable, MmapError,
    PollToken, RawDescriptor, Result as SysResult, SignalFd, WaitContext, SIGRTMIN,
};
use kvm::{Cap, Datamatch, IoeventAddress, Kvm, Vcpu, VcpuExit, Vm};
use minijail::{self, Minijail};
use net_util::{Tap, TapT};
use vm_memory::{GuestMemory, MemoryPolicy};

use self::process::*;
use self::vcpu::*;
use crate::{Config, Executable};

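// Maximum sizes for the control and per-VCPU datagram channels, and the environment variable
// used to pass the GPU render server socket to the plugin.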
const MAX_DATAGRAM_SIZE: usize = 4096;
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
const CROSVM_GPU_SERVER_FD_ENV: &str = "CROSVM_GPU_SERVER_FD";

/// An error that occurs when communicating with the plugin process.
#[sorted]
#[derive(Error, Debug)]
pub enum CommError {
    #[error("failed to decode plugin request: {0}")]
    DecodeRequest(ProtobufError),
    #[error("failed to encode plugin response: {0}")]
    EncodeResponse(ProtobufError),
    #[error("plugin request socket has been hung up")]
    PluginSocketHup,
    #[error("failed to recv from plugin request socket: {0}")]
    PluginSocketRecv(SysError),
    #[error("failed to send to plugin request socket: {0}")]
    PluginSocketSend(SysError),
}

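/// Creates a connected pair of anonymous SOCK_SEQPACKET sockets.
///
/// A minimal usage sketch (the variable names here are illustrative only):
///
/// ```ignore
/// let (crosvm_socket, plugin_socket) = new_seqpacket_pair()?;
/// ```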
fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
    let mut fds = [0, 0];
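    // Safe because socketpair only writes the two new descriptors into `fds` on success, and we
    // check the return value before taking ownership of them.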
    unsafe {
        let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
        if ret == 0 {
            ioctl(fds[0], FIOCLEX);
            Ok((
                UnixDatagram::from_raw_descriptor(fds[0]),
                UnixDatagram::from_raw_descriptor(fds[1]),
            ))
        } else {
            Err(SysError::last())
        }
    }
}

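/// The pipe pairs used to communicate with a plugin's VCPU threads: one pipe carries messages
/// toward crosvm and the other carries messages toward the plugin.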
struct VcpuPipe {
    crosvm_read: File,
    plugin_write: File,
    plugin_read: File,
    crosvm_write: File,
}

fn new_pipe_pair() -> SysResult<VcpuPipe> {
    let to_crosvm = pipe(true)?;
    let to_plugin = pipe(true)?;
    // Increasing the pipe size is a nice-to-have that helps ensure messages get across
    // atomically (and that writes don't block), though it is not necessarily a hard
    // requirement for things to work.
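    // Safe because fcntl with F_SETPIPE_SZ only resizes the pipe referred to by the descriptor
    // and we check the result.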
    let flags = unsafe {
        fcntl(
            to_crosvm.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of crosvm pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
    let flags = unsafe {
        fcntl(
            to_plugin.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of plugin pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
    Ok(VcpuPipe {
        crosvm_read: to_crosvm.0,
        plugin_write: to_crosvm.1,
        plugin_read: to_plugin.0,
        crosvm_write: to_plugin.1,
    })
}

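/// Converts a protobuf error into a `SysError`, preserving the underlying OS error code when one
/// is available and falling back to EINVAL otherwise.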
fn proto_to_sys_err(e: ProtobufError) -> SysError {
    match e {
        ProtobufError::IoError(e) => SysError::new(e.raw_os_error().unwrap_or(EINVAL)),
        _ => SysError::new(EINVAL),
    }
}

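/// Converts an `io::Error` into a `SysError`, defaulting to EINVAL when there is no OS error
/// code.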
fn io_to_sys_err(e: io::Error) -> SysError {
    SysError::new(e.raw_os_error().unwrap_or(EINVAL))
}

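/// Converts an `MmapError` into a `SysError`, passing failed system calls through unchanged and
/// treating every other case as EINVAL.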
fn mmap_to_sys_err(e: MmapError) -> SysError {
    match e {
        MmapError::SystemCallFailed(e) => e,
        _ => SysError::new(EINVAL),
    }
}

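/// Builds the minijail sandbox that the plugin process will run in: fresh user, PID, mount, and
/// network namespaces, no capabilities, a tmpfs pivot root at `root`, and the seccomp policy
/// found at `seccomp_policy`.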
fn create_plugin_jail(root: &Path, log_failures: bool, seccomp_policy: &Path) -> Result<Minijail> {
    // All child jails run in a new user namespace without any users mapped; they run as nobody
    // unless otherwise configured.
    let mut j = Minijail::new().context("failed to create jail")?;
    j.namespace_pids();
    j.namespace_user();
    j.uidmap(&format!("0 {0} 1", geteuid()))
        .context("failed to set uidmap for jail")?;
    j.gidmap(&format!("0 {0} 1", getegid()))
        .context("failed to set gidmap for jail")?;
    j.namespace_user_disable_setgroups();
    // Don't need any capabilities.
    j.use_caps(0);
    // Create a new mount namespace with an empty root FS.
    j.namespace_vfs();
    j.enter_pivot_root(root)
        .context("failed to set jail pivot root")?;
    // Run in an empty network namespace.
    j.namespace_net();
    j.no_new_privs();
    // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
    // is expected to be compiled using "trap" as the failure behavior instead of the default
    // "kill" behavior).
    // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
    // explanation of why the |log_failures| flag forces the use of .policy files (and the
    // build-time alternative to this run-time flag).
    let bpf_policy_file = seccomp_policy.with_extension("bpf");
    if bpf_policy_file.exists() && !log_failures {
        j.parse_seccomp_program(&bpf_policy_file)
            .context("failed to parse jail seccomp BPF program")?;
    } else {
        // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
        // kill the entire device process if a worker thread commits a seccomp violation.
        j.set_seccomp_filter_tsync();
        if log_failures {
            j.log_seccomp_filter_failures();
        }
        j.parse_seccomp_filters(&seccomp_policy.with_extension("policy"))
            .context("failed to parse jail seccomp filter")?;
    }
    j.use_seccomp_filter();
    // Don't do init setup.
    j.run_as_init();

    // Create a tmpfs in the plugin's root directory so that we can bind mount its executable
    // file into it. The size=67108864 is 64 * 1024 * 1024, or 64 MiB.
    j.mount_with_data(
        Path::new("none"),
        Path::new("/"),
        "tmpfs",
        (MS_NOSUID | MS_NODEV | MS_NOEXEC) as usize,
        "size=67108864",
    )
    .context("failed to mount root")?;

    // Because we requested to "run as init", minijail will not mount /proc for us even though
    // the plugin will be running in its own PID namespace, so we have to mount it ourselves.
    j.mount(
        Path::new("proc"),
        Path::new("/proc"),
        "proc",
        (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RDONLY) as usize,
    )
    .context("failed to mount proc")?;

    Ok(j)
}

/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be called by matching the variant to the expected type for that
/// method. For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///    Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///    _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    IoEvent {
        evt: Event,
        addr: IoeventAddress,
        length: u32,
        datamatch: u64,
    },
    Memory {
        slot: u32,
        length: usize,
    },
    IrqEvent {
        irq_id: u32,
        evt: Event,
    },
}

impl PluginObject {
    fn destroy(self, vm: &mut Vm) -> SysResult<()> {
        match self {
            PluginObject::IoEvent {
                evt,
                addr,
                length,
                datamatch,
            } => match length {
                0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
                1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
                2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
                4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
                8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch as u64))),
                _ => Err(SysError::new(EINVAL)),
            },
            PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())),
            PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
        }
    }
}

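/// Spawns a thread for each of the `vcpu_count` VCPUs and runs the KVM dispatch loop on it,
/// forwarding I/O and MMIO exits to `plugin`. A barrier keeps any VCPU from entering the run
/// loop until every VCPU has been initialized.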
pub fn run_vcpus(
    kvm: &Kvm,
    vm: &Vm,
    plugin: &Process,
    vcpu_count: u32,
    kill_signaled: &Arc<AtomicBool>,
    exit_evt: &Event,
    vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
) -> Result<()> {
    let vcpu_thread_barrier = Arc::new(Barrier::new(vcpu_count as usize));
    let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);

    // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
    // to that vcpu's thread.  If KVM is running the VM then it'll return -EINTR.
    // An issue is what to do when KVM isn't running the VM (where we could be
    // in the kernel or in the app).
    //
    // If KVM supports "immediate exit" then we set a signal handler that will
    // set the |immediate_exit| flag that tells KVM to return -EINTR before running
    // the VM.
    //
    // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
    // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
    // signal might get asserted).  There's overhead to have KVM unblock and re-block
    // SIGRTMIN each time it runs the VM, so this mode should be avoided.

    if use_kvm_signals {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {}
            // Our signal handler does nothing and is trivially async signal safe.
            // We need to install this signal handler even though we do block
            // the signal below, to ensure that this signal will interrupt
            // execution of KVM_RUN (this is an implementation issue).
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
        // We do not really want the signal handler to run...
        block_signal(SIGRTMIN() + 0).expect("failed to block signal");
    } else {
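        // This handler only sets the thread-local immediate-exit flag, which is async signal
        // safe.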
        unsafe {
            extern "C" fn handle_signal(_: c_int) {
                Vcpu::set_local_immediate_exit(true);
            }
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
    }

    for cpu_id in 0..vcpu_count {
        let kill_signaled = kill_signaled.clone();
        let vcpu_thread_barrier = vcpu_thread_barrier.clone();
        let vcpu_exit_evt = exit_evt.try_clone().context("failed to clone event")?;
        let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
        let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).context("error creating vcpu")?;

        vcpu_handles.push(
            thread::Builder::new()
                .name(format!("crosvm_vcpu{}", cpu_id))
                .spawn(move || {
                    if use_kvm_signals {
                        // Tell KVM to not block anything when entering kvm run because we will
                        // be using the first RT signal to kick the VCPU.
                        vcpu.set_signal_mask(&[])
                            .expect("failed to set up KVM VCPU signal mask");
                    }

                    if let Err(e) = enable_core_scheduling() {
                        error!("Failed to enable core scheduling: {}", e);
                    }

                    let vcpu = vcpu
                        .to_runnable(Some(SIGRTMIN() + 0))
                        .expect("Failed to set thread id");

                    let res = vcpu_plugin.init(&vcpu);
                    vcpu_thread_barrier.wait();
                    if let Err(e) = res {
                        error!("failed to initialize vcpu {}: {}", cpu_id, e);
                    } else {
                        loop {
                            let mut interrupted_by_signal = false;
                            let run_res = vcpu.run();
                            match run_res {
                                Ok(run) => match run {
                                    VcpuExit::IoIn { port, mut size } => {
                                        let mut data = [0; 256];
                                        if size > data.len() {
                                            error!(
                                                "unsupported IoIn size of {} bytes at port {:#x}",
                                                size, port
                                            );
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
                                        if let Err(e) = vcpu.set_data(&data[..size]) {
                                            error!(
                                                "failed to set return data for IoIn at port {:#x}: {}",
                                                port, e
                                            );
                                        }
                                    }
                                    VcpuExit::IoOut {
                                        port,
                                        mut size,
                                        data,
                                    } => {
                                        if size > data.len() {
                                            error!("unsupported IoOut size of {} bytes at port {:#x}", size, port);
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
                                    }
                                    VcpuExit::MmioRead { address, size } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.mmio_read(
                                            address as u64,
                                            &mut data[..size],
                                            &vcpu,
                                        );
                                        // Setting data for mmio cannot fail.
                                        let _ = vcpu.set_data(&data[..size]);
                                    }
                                    VcpuExit::MmioWrite {
                                        address,
                                        size,
                                        data,
                                    } => {
                                        vcpu_plugin.mmio_write(
                                            address as u64,
                                            &data[..size],
                                            &vcpu,
                                        );
                                    }
                                    VcpuExit::HypervHcall { input, params } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
                                        // Setting data for a hyperv call cannot fail.
                                        let _ = vcpu.set_data(&data);
                                    }
                                    VcpuExit::HypervSynic {
                                        msr,
                                        control,
                                        evt_page,
                                        msg_page,
                                    } => {
                                        vcpu_plugin
                                            .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
                                    }
                                    VcpuExit::Hlt => break,
                                    VcpuExit::Shutdown => break,
                                    VcpuExit::InternalError => {
                                        error!("vcpu {} has internal error", cpu_id);
                                        break;
                                    }
                                    r => warn!("unexpected vcpu exit: {:?}", r),
                                },
                                Err(e) => match e.errno() {
                                    EINTR => interrupted_by_signal = true,
                                    EAGAIN => {}
                                    _ => {
                                        error!("vcpu hit unknown error: {}", e);
                                        break;
                                    }
                                },
                            }
                            if kill_signaled.load(Ordering::SeqCst) {
                                break;
                            }

                            // Only handle the pause request if KVM reported that it was
                            // interrupted by a signal. This helps to ensure that KVM has had a
                            // chance to finish emulating any IO that may have immediately
                            // happened. If we eagerly check pre_run() then any IO that we just
                            // reported to the plugin won't have been processed yet by KVM. Not
                            // eagerly calling pre_run() also helps to reduce any overhead from
                            // checking if a pause request is pending. The assumption is that
                            // pause requests aren't common or frequent, so it's better to
                            // optimize for the non-pause execution paths.
                            if interrupted_by_signal {
                                if use_kvm_signals {
                                    clear_signal(SIGRTMIN() + 0)
                                        .expect("failed to clear pending signal");
                                } else {
                                    vcpu.set_immediate_exit(false);
                                }

                                if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
                                    error!("failed to process pause on vcpu {}: {}", cpu_id, e);
                                    break;
                                }
                            }
                        }
                    }
                    vcpu_exit_evt
                        .write(1)
                        .expect("failed to signal vcpu exit event");
                })
                .context("error spawning vcpu thread")?,
        );
    }
    Ok(())
}

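/// Tokens for the events watched by the wait loop in `run_config`.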
#[derive(PollToken)]
enum Token {
    Exit,
    ChildSignal,
    Stderr,
    Plugin { index: usize },
}

/// Run a VM with a plugin process specified by `cfg`.
///
/// Not every field of `cfg` will be used. In particular, most fields that pertain to a specific
/// device are ignored because the plugin is responsible for emulating hardware.
pub fn run_config(cfg: Config) -> Result<()> {
    info!("crosvm starting plugin process");

    // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
    // before any jailed devices have been spawned, so that we can catch any of them that fail very
    // quickly.
    let sigchld_fd = SignalFd::new(SIGCHLD).context("failed to create signalfd")?;

    // Create a pipe to capture error messages from plugin and minijail.
    let (mut stderr_rd, stderr_wr) = pipe(true).context("failed to create stderr pipe")?;
    add_fd_flags(stderr_rd.as_raw_descriptor(), O_NONBLOCK)
        .context("error marking stderr nonblocking")?;

    #[allow(unused_mut)]
    let mut env_fds: Vec<(String, Descriptor)> = Vec::default();

    let _default_render_server_params = crate::platform::GpuRenderServerParameters {
        path: std::path::PathBuf::from("/usr/libexec/virgl_render_server"),
        cache_path: None,
        cache_size: None,
    };

    #[cfg(feature = "gpu")]
    let gpu_render_server_parameters = if let Some(parameters) = &cfg.gpu_render_server_parameters {
        Some(parameters)
    } else if cfg!(feature = "plugin-render-server") {
        Some(&_default_render_server_params)
    } else {
        None
    };

    #[cfg(feature = "gpu")]
    // Hold on to the render server jail so it keeps running until we exit run_config().
    let (_render_server_jail, _render_server_fd) =
        if let Some(parameters) = &gpu_render_server_parameters {
            let (jail, fd) = crate::platform::gpu::start_gpu_render_server(&cfg, parameters)?;
            env_fds.push((
                CROSVM_GPU_SERVER_FD_ENV.to_string(),
                Descriptor(fd.as_raw_descriptor()),
            ));
            (
                Some(crate::platform::jail_helpers::ScopedMinijail(jail)),
                Some(fd),
            )
        } else {
            (None, None)
        };

    let jail = if let Some(jail_config) = &cfg.jail_config {
        // An empty directory for the jailed plugin pivot root.
        let root_path = match &cfg.plugin_root {
            Some(dir) => dir,
            None => Path::new(option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty")),
        };

        if root_path.is_relative() {
            bail!("path to the root directory must be absolute");
        }

        if !root_path.exists() {
            bail!("no root directory for jailed process to pivot root into");
        }

        if !root_path.is_dir() {
            bail!("specified root directory is not a directory");
        }

        let policy_path = jail_config.seccomp_policy_dir.join("plugin");
        let mut jail =
            create_plugin_jail(root_path, jail_config.seccomp_log_failures, &policy_path)?;

        // Update gid map of the jail if caller provided supplemental groups.
        if !cfg.plugin_gid_maps.is_empty() {
            let map = format!("0 {} 1", getegid())
                + &cfg
                    .plugin_gid_maps
                    .into_iter()
                    .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count))
                    .collect::<String>();
            jail.gidmap(&map).context("failed to set gidmap for jail")?;
        }

        // Mount a minimal set of devices (full, zero, urandom, etc). We cannot use
        // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
        let device_names = ["full", "null", "urandom", "zero"];
        for name in &device_names {
            let device = Path::new("/dev").join(&name);
            jail.mount_bind(&device, &device, true)
                .context("failed to mount dev")?;
        }

        for bind_mount in &cfg.plugin_mounts {
            jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
                .with_context(|| {
                    format!(
                        "failed to bind mount {} -> {} as {} ",
                        bind_mount.src.display(),
                        bind_mount.dst.display(),
                        if bind_mount.writable {
                            "writable"
                        } else {
                            "read only"
                        }
                    )
                })?;
        }

        Some(jail)
    } else {
        None
    };

    let mut tap_interfaces: Vec<Tap> = Vec::new();
    if let Some(host_ip) = cfg.host_ip {
        if let Some(netmask) = cfg.netmask {
            if let Some(mac_address) = cfg.mac_address {
                let tap = Tap::new(false, false).context("error opening tap device")?;
                tap.set_ip_addr(host_ip).context("error setting tap ip")?;
                tap.set_netmask(netmask)
                    .context("error setting tap netmask")?;
                tap.set_mac_address(mac_address)
                    .context("error setting tap mac address")?;

                tap.enable().context("error enabling tap device")?;
                tap_interfaces.push(tap);
            }
        }
    }
    for tap_fd in cfg.tap_fd {
        // Safe because we ensure that we get a unique handle to the fd.
        let tap = unsafe {
            Tap::from_raw_descriptor(
                validate_raw_descriptor(tap_fd).context("failed to validate raw tap fd")?,
            )
            .context("failed to create tap device from raw fd")?
        };
        tap_interfaces.push(tap);
    }

    let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();

    let plugin_path = match cfg.executable_path {
        Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
        _ => panic!("Executable was not a plugin"),
    };
    let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32;
    let mem = GuestMemory::new(&[]).unwrap();
    let mut mem_policy = MemoryPolicy::empty();
    if cfg.hugepages {
        mem_policy |= MemoryPolicy::USE_HUGEPAGES;
    }
    mem.set_memory_policy(mem_policy);
    let kvm = Kvm::new_with_path(&cfg.kvm_device_path).context("error creating Kvm")?;
    let mut vm = Vm::new(&kvm, mem).context("error creating vm")?;
    vm.create_irq_chip()
        .context("failed to create kvm irqchip")?;
    vm.create_pit().context("failed to create kvm PIT")?;

    let mut plugin = Process::new(
        vcpu_count,
        plugin_path,
        &plugin_args,
        jail,
        stderr_wr,
        env_fds,
    )?;
    // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
    // we can drop all our capabilities in case we had any.
    drop_capabilities().context("failed to drop process capabilities")?;

    let mut res = Ok(());
    // If Some, we will exit after enough time has passed to shut down cleanly.
    let mut dying_instant: Option<Instant> = None;
    let duration_to_die = Duration::from_millis(1000);

    let exit_evt = Event::new().context("failed to create event")?;
    let kill_signaled = Arc::new(AtomicBool::new(false));
    let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);

    let wait_ctx = WaitContext::build_with(&[
        (&exit_evt, Token::Exit),
        (&sigchld_fd, Token::ChildSignal),
        (&stderr_rd, Token::Stderr),
    ])
    .context("failed to add control descriptors to wait context")?;

    let mut sockets_to_drop = Vec::new();
    let mut redo_wait_ctx_sockets = true;
    // In this loop, make every attempt to not return early. If an error is encountered, set `res`
    // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
    // If the plugin cannot be signaled because it is dead or `signal_kill` failed, simply break
    // from the poll loop so that the VCPU threads can be cleaned up.
    'wait: loop {
        // After we have waited long enough, it's time to give up and exit.
        if dying_instant
            .map(|i| i.elapsed() >= duration_to_die)
            .unwrap_or(false)
        {
            break;
        }

        if redo_wait_ctx_sockets {
            for (index, socket) in plugin.sockets().iter().enumerate() {
                wait_ctx
                    .add(socket, Token::Plugin { index })
                    .context("failed to add plugin sockets to wait context")?;
            }
        }

        let plugin_socket_count = plugin.sockets().len();
        let events = {
            let poll_res = match dying_instant {
                Some(inst) => wait_ctx.wait_timeout(duration_to_die - inst.elapsed()),
                None => wait_ctx.wait(),
            };
            match poll_res {
                Ok(v) => v,
                Err(e) => {
                    // Polling no longer works; time to break and clean up.
                    if res.is_ok() {
                        res = Err(e).context("failed to poll all FDs");
                    }
                    break;
                }
            }
        };

        for event in events.iter().filter(|e| e.is_hungup) {
            if let Token::Stderr = event.token {
                let _ = wait_ctx.delete(&stderr_rd);
            }
        }

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::Exit => {
                    // No need to check the exit event if we are already doing cleanup.
                    let _ = wait_ctx.delete(&exit_evt);
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on exit event");
                    }
                }
                Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop.
                    loop {
                        match sigchld_fd.read() {
                            Ok(Some(siginfo)) => {
                                // If the plugin process has ended, there is no need to continue
                                // processing plugin connections, so we break early.
                                if siginfo.ssi_pid == plugin.pid() as u32 {
                                    break 'wait;
                                }
                                // Because SIGCHLD is not expected from anything other than the
                                // plugin process, report it as an error.
                                if res.is_ok() {
                                    res = Err(anyhow!(
                                        "process {} died with signal {}, status {}, and code {}",
                                        siginfo.ssi_pid,
                                        siginfo.ssi_signo,
                                        siginfo.ssi_status,
                                        siginfo.ssi_code,
                                    ));
                                }
                            }
                            Ok(None) => break, // No more signals to read.
                            Err(e) => {
                                // Something really must be messed up for this to happen; continue
                                // processing connections for a limited time.
                                if res.is_ok() {
                                    res = Err(e).context("failed to read signal fd");
                                }
                                break;
                            }
                        }
                    }
                    // As we only spawn the plugin process, getting a SIGCHLD can only mean
                    // something went wrong.
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on SIGCHLD");
                    }
                }
                Token::Stderr => loop {
                    let mut buf = [0u8; 4096];
                    match stderr_rd.read(&mut buf) {
                        Ok(len) => {
                            for l in String::from_utf8_lossy(&buf[0..len]).lines() {
                                error!("minijail/plugin: {}", l);
                            }
                        }
                        Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {
                            break;
                        }
                        Err(e) => {
                            error!("failed reading from stderr: {}", e);
                            break;
                        }
                    }
                },
                Token::Plugin { index } => {
                    match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
                    {
                        Ok(_) => {}
                        // A HUP is an expected event for a socket, so don't bother warning about
                        // it.
                        Err(CommError::PluginSocketHup) => sockets_to_drop.push(index),
                        // Only one connection out of potentially many is broken. Drop it, but
                        // don't start cleaning up. Because the error isn't returned, we warn
                        // about it here.
                        Err(e) => {
                            warn!("error handling plugin socket: {}", e);
                            sockets_to_drop.push(index);
                        }
                    }
                }
            }
        }

        if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
            let res = run_vcpus(
                &kvm,
                &vm,
                &plugin,
                vcpu_count,
                &kill_signaled,
                &exit_evt,
                &mut vcpu_handles,
            );
            if let Err(e) = res {
                dying_instant.get_or_insert(Instant::now());
                error!("failed to start vcpus: {}", e);
            }
        }

        redo_wait_ctx_sockets =
            !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;

        // Clean up all of the sockets that we have determined were disconnected or suffered some
        // other error.
        plugin.drop_sockets(&mut sockets_to_drop);
        sockets_to_drop.clear();

        if redo_wait_ctx_sockets {
            for socket in plugin.sockets() {
                let _ = wait_ctx.delete(socket);
            }
        }
    }

    // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
    kill_signaled.store(true, Ordering::SeqCst);
    // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests
    // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
    // blocked connections.
    plugin
        .signal_kill()
        .context("error sending kill signal to plugin on cleanup")?;
    for handle in vcpu_handles {
        match handle.kill(SIGRTMIN() + 0) {
            Ok(_) => {
                if let Err(e) = handle.join() {
                    error!("failed to join vcpu thread: {:?}", e);
                }
            }
            Err(e) => error!("failed to kill vcpu thread: {}", e),
        }
    }

    match plugin.try_wait() {
        // The plugin has run out of time by now.
        Ok(ProcessStatus::Running) => Err(anyhow!("plugin did not exit within timeout")),
        // Return an error discovered earlier in this function.
        Ok(ProcessStatus::Success) => res.map_err(anyhow::Error::msg),
        Ok(ProcessStatus::Fail(code)) => Err(anyhow!("plugin exited with error: {}", code)),
        Ok(ProcessStatus::Signal(code)) => Err(anyhow!("plugin exited with signal {}", code)),
        Err(e) => Err(anyhow!("error waiting for plugin to exit: {}", e)),
    }
}