• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 mod process;
6 mod vcpu;
7 
8 use std::fs::File;
9 use std::io;
10 use std::io::Read;
11 use std::io::Write;
12 use std::os::unix::net::UnixDatagram;
13 use std::path::Path;
14 use std::sync::atomic::AtomicBool;
15 use std::sync::atomic::Ordering;
16 use std::sync::Arc;
17 use std::sync::Barrier;
18 use std::thread;
19 use std::time::Duration;
20 use std::time::Instant;
21 
22 use anyhow::anyhow;
23 use anyhow::bail;
24 use anyhow::Context;
25 use anyhow::Result;
26 use base::add_fd_flags;
27 use base::block_signal;
28 use base::clear_signal;
29 use base::drop_capabilities;
30 use base::enable_core_scheduling;
31 use base::error;
32 use base::getegid;
33 use base::geteuid;
34 use base::info;
35 use base::pipe;
36 use base::register_rt_signal_handler;
37 use base::validate_raw_descriptor;
38 use base::warn;
39 use base::AsRawDescriptor;
40 use base::Error as SysError;
41 use base::Event;
42 use base::EventToken;
43 use base::FromRawDescriptor;
44 use base::Killable;
45 use base::MmapError;
46 use base::RawDescriptor;
47 use base::Result as SysResult;
48 use base::SignalFd;
49 use base::WaitContext;
50 use base::SIGRTMIN;
51 use devices::virtio::NetParametersMode;
52 use jail::create_sandbox_minijail;
53 use jail::mount_proc;
54 use jail::SandboxConfig;
55 use kvm::Cap;
56 use kvm::Datamatch;
57 use kvm::IoeventAddress;
58 use kvm::Kvm;
59 use kvm::Vcpu;
60 use kvm::VcpuExit;
61 use kvm::Vm;
62 use libc::c_int;
63 use libc::c_ulong;
64 use libc::fcntl;
65 use libc::ioctl;
66 use libc::socketpair;
67 use libc::AF_UNIX;
68 use libc::EAGAIN;
69 use libc::EBADF;
70 use libc::EDEADLK;
71 use libc::EEXIST;
72 use libc::EINTR;
73 use libc::EINVAL;
74 use libc::ENOENT;
75 use libc::EOVERFLOW;
76 use libc::EPERM;
77 use libc::FIOCLEX;
78 use libc::F_SETPIPE_SZ;
79 use libc::O_NONBLOCK;
80 use libc::SIGCHLD;
81 use libc::SOCK_SEQPACKET;
82 use net_util::sys::unix::Tap;
83 use net_util::TapTCommon;
84 use protobuf::ProtobufError;
85 use remain::sorted;
86 use thiserror::Error;
87 use vm_memory::GuestMemory;
88 use vm_memory::MemoryPolicy;
89 
90 use self::process::*;
91 use self::vcpu::*;
92 use crate::crosvm::config::Executable;
93 use crate::crosvm::config::HypervisorKind;
94 use crate::Config;
95 
96 const MAX_DATAGRAM_SIZE: usize = 4096;
97 const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
98 const MAX_OPEN_FILES: u64 = 32768;
99 
/// An error that occurs when communicating with the plugin process.
#[sorted]
#[derive(Error, Debug)]
pub enum CommError {
    /// A request read from the plugin socket could not be parsed as protobuf.
    #[error("failed to decode plugin request: {0}")]
    DecodeRequest(ProtobufError),
    /// A response could not be serialized to protobuf before sending.
    #[error("failed to encode plugin response: {0}")]
    EncodeResponse(ProtobufError),
    /// The plugin closed its end of the request socket.
    #[error("plugin request socket has been hung up")]
    PluginSocketHup,
    /// A recv on the plugin request socket failed with the given OS error.
    #[error("failed to recv from plugin request socket: {0}")]
    PluginSocketRecv(SysError),
    /// A send on the plugin request socket failed with the given OS error.
    #[error("failed to send to plugin request socket: {0}")]
    PluginSocketSend(SysError),
}
115 
/// Creates a connected pair of `SOCK_SEQPACKET` Unix sockets.
///
/// Only the first socket of the pair is marked close-on-exec; presumably the
/// second end is meant to be inherited by the exec'd plugin process —
/// TODO confirm against callers.
fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
    let mut fds = [0, 0];
    // SAFETY-relevant: on success `socketpair` fills `fds` with two valid,
    // uniquely-owned descriptors, which are then wrapped into owning
    // `UnixDatagram`s exactly once each.
    unsafe {
        let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
        if ret == 0 {
            // Best-effort: set close-on-exec on the crosvm-side end; the
            // ioctl result is deliberately ignored.
            ioctl(fds[0], FIOCLEX);
            Ok((
                UnixDatagram::from_raw_descriptor(fds[0]),
                UnixDatagram::from_raw_descriptor(fds[1]),
            ))
        } else {
            Err(SysError::last())
        }
    }
}
131 
/// The two unidirectional pipes connecting crosvm with a plugin VCPU, one for
/// each direction (see `new_pipe_pair` for how the ends are paired).
struct VcpuPipe {
    // Read end of the plugin -> crosvm pipe, kept by crosvm.
    crosvm_read: File,
    // Write end of the plugin -> crosvm pipe, handed to the plugin.
    plugin_write: File,
    // Read end of the crosvm -> plugin pipe, handed to the plugin.
    plugin_read: File,
    // Write end of the crosvm -> plugin pipe, kept by crosvm.
    crosvm_write: File,
}
138 
new_pipe_pair() -> SysResult<VcpuPipe>139 fn new_pipe_pair() -> SysResult<VcpuPipe> {
140     let to_crosvm = pipe(true)?;
141     let to_plugin = pipe(true)?;
142     // Increasing the pipe size can be a nice-to-have to make sure that
143     // messages get across atomically (and made sure that writes don't block),
144     // though it's not necessary a hard requirement for things to work.
145     let flags = unsafe {
146         fcntl(
147             to_crosvm.0.as_raw_descriptor(),
148             F_SETPIPE_SZ,
149             MAX_VCPU_DATAGRAM_SIZE as c_int,
150         )
151     };
152     if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
153         warn!(
154             "Failed to adjust size of crosvm pipe (result {}): {}",
155             flags,
156             SysError::last()
157         );
158     }
159     let flags = unsafe {
160         fcntl(
161             to_plugin.0.as_raw_descriptor(),
162             F_SETPIPE_SZ,
163             MAX_VCPU_DATAGRAM_SIZE as c_int,
164         )
165     };
166     if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
167         warn!(
168             "Failed to adjust size of plugin pipe (result {}): {}",
169             flags,
170             SysError::last()
171         );
172     }
173     Ok(VcpuPipe {
174         crosvm_read: to_crosvm.0,
175         plugin_write: to_crosvm.1,
176         plugin_read: to_plugin.0,
177         crosvm_write: to_plugin.1,
178     })
179 }
180 
proto_to_sys_err(e: ProtobufError) -> SysError181 fn proto_to_sys_err(e: ProtobufError) -> SysError {
182     match e {
183         ProtobufError::IoError(e) => SysError::new(e.raw_os_error().unwrap_or(EINVAL)),
184         _ => SysError::new(EINVAL),
185     }
186 }
187 
io_to_sys_err(e: io::Error) -> SysError188 fn io_to_sys_err(e: io::Error) -> SysError {
189     SysError::new(e.raw_os_error().unwrap_or(EINVAL))
190 }
191 
mmap_to_sys_err(e: MmapError) -> SysError192 fn mmap_to_sys_err(e: MmapError) -> SysError {
193     match e {
194         MmapError::SystemCallFailed(e) => e,
195         _ => SysError::new(EINVAL),
196     }
197 }
198 
/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be done by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///    Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///    _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    /// An ioevent registered with the VM (see `destroy`, which unregisters it).
    IoEvent {
        evt: Event,
        addr: IoeventAddress,
        // Width of the datamatch in bytes; 0 means match any length, and only
        // 0/1/2/4/8 are valid (see `destroy`).
        length: u32,
        datamatch: u64,
    },
    /// A memory region occupying a VM memory slot.
    Memory {
        slot: u32,
        length: usize,
    },
    /// An irqfd: `evt` is wired to interrupt `irq_id`.
    IrqEvent {
        irq_id: u32,
        evt: Event,
    },
}
233 
234 impl PluginObject {
destroy(self, vm: &mut Vm) -> SysResult<()>235     fn destroy(self, vm: &mut Vm) -> SysResult<()> {
236         match self {
237             PluginObject::IoEvent {
238                 evt,
239                 addr,
240                 length,
241                 datamatch,
242             } => match length {
243                 0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
244                 1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
245                 2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
246                 4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
247                 8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch as u64))),
248                 _ => Err(SysError::new(EINVAL)),
249             },
250             PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())),
251             PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
252         }
253     }
254 }
255 
/// Spawns one thread per VCPU and runs each VCPU until it halts, hits an
/// unrecoverable error, or `kill_signaled` becomes true.
///
/// Thread handles (named `crosvm_vcpuN`) are appended to `vcpu_handles` for
/// the caller to join later. Every VCPU thread signals `exit_evt` when it
/// finishes. If `vcpu_cgroup_tasks_file` is given, each thread writes its tid
/// into it to move itself into that cgroup before running.
///
/// Returns an error if an event clone, VCPU creation, or thread spawn fails.
pub fn run_vcpus(
    kvm: &Kvm,
    vm: &Vm,
    plugin: &Process,
    vcpu_count: u32,
    kill_signaled: &Arc<AtomicBool>,
    exit_evt: &Event,
    vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
    vcpu_cgroup_tasks_file: Option<File>,
) -> Result<()> {
    // All VCPU threads rendezvous on this barrier after init, before running.
    let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_count) as usize));
    let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);

    // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
    // to that vcpu's thread.  If KVM is running the VM then it'll return -EINTR.
    // An issue is what to do when KVM isn't running the VM (where we could be
    // in the kernel or in the app).
    //
    // If KVM supports "immediate exit" then we set a signal handler that will
    // set the |immediate_exit| flag that tells KVM to return -EINTR before running
    // the VM.
    //
    // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
    // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
    // signal might get asserted).  There's overhead to have KVM unblock and re-block
    // SIGRTMIN each time it runs the VM, so this mode should be avoided.

    if use_kvm_signals {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {}
            // Our signal handler does nothing and is trivially async signal safe.
            // We need to install this signal handler even though we do block
            // the signal below, to ensure that this signal will interrupt
            // execution of KVM_RUN (this is implementation issue).
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
        // We do not really want the signal handler to run...
        block_signal(SIGRTMIN() + 0).expect("failed to block signal");
    } else {
        unsafe {
            // This handler runs on the VCPU thread; it only flips the
            // thread-local immediate-exit flag, which is async-signal-safe.
            extern "C" fn handle_signal(_: c_int) {
                Vcpu::set_local_immediate_exit(true);
            }
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
    }

    for cpu_id in 0..vcpu_count {
        let kill_signaled = kill_signaled.clone();
        let vcpu_thread_barrier = vcpu_thread_barrier.clone();
        let vcpu_exit_evt = exit_evt.try_clone().context("failed to clone event")?;
        let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
        let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).context("error creating vcpu")?;
        let vcpu_cgroup_tasks_file = vcpu_cgroup_tasks_file
            .as_ref()
            .map(|f| f.try_clone().unwrap());

        vcpu_handles.push(
            thread::Builder::new()
                .name(format!("crosvm_vcpu{}", cpu_id))
                .spawn(move || {
                    if use_kvm_signals {
                        // Tell KVM to not block anything when entering kvm run
                        // because we will be using first RT signal to kick the VCPU.
                        vcpu.set_signal_mask(&[])
                            .expect("failed to set up KVM VCPU signal mask");
                    }
                    // Move vcpu thread to cgroup
                    if let Some(mut f) = vcpu_cgroup_tasks_file {
                        f.write_all(base::gettid().to_string().as_bytes()).unwrap();
                    }

                    if let Err(e) = enable_core_scheduling() {
                        error!("Failed to enable core scheduling: {}", e);
                    }

                    let vcpu = vcpu
                        .to_runnable(Some(SIGRTMIN() + 0))
                        .expect("Failed to set thread id");

                    let res = vcpu_plugin.init(&vcpu);
                    // Wait until every VCPU has finished init before any of
                    // them starts running.
                    vcpu_thread_barrier.wait();
                    if let Err(e) = res {
                        error!("failed to initialize vcpu {}: {}", cpu_id, e);
                    } else {
                        loop {
                            let mut interrupted_by_signal = false;
                            let run_res = vcpu.run();
                            match run_res {
                                Ok(run) => match run {
                                    VcpuExit::IoIn { port, mut size } => {
                                        let mut data = [0; 256];
                                        // Clamp oversized accesses to the local
                                        // buffer rather than aborting the VCPU.
                                        if size > data.len() {
                                            error!(
                                                "unsupported IoIn size of {} bytes at port {:#x}",
                                                size, port
                                            );
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
                                        if let Err(e) = vcpu.set_data(&data[..size]) {
                                            error!(
                                                "failed to set return data for IoIn at port {:#x}: {}",
                                                port, e
                                            );
                                        }
                                    }
                                    VcpuExit::IoOut {
                                        port,
                                        mut size,
                                        data,
                                    } => {
                                        if size > data.len() {
                                            error!("unsupported IoOut size of {} bytes at port {:#x}", size, port);
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
                                    }
                                    VcpuExit::MmioRead { address, size } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.mmio_read(
                                            address as u64,
                                            &mut data[..size],
                                            &vcpu,
                                        );
                                        // Setting data for mmio can not fail.
                                        let _ = vcpu.set_data(&data[..size]);
                                    }
                                    VcpuExit::MmioWrite {
                                        address,
                                        size,
                                        data,
                                    } => {
                                        vcpu_plugin.mmio_write(
                                            address as u64,
                                            &data[..size],
                                            &vcpu,
                                        );
                                    }
                                    VcpuExit::HypervHcall { input, params } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
                                        // Setting data for hyperv call can not fail.
                                        let _ = vcpu.set_data(&data);
                                    }
                                    VcpuExit::HypervSynic {
                                        msr,
                                        control,
                                        evt_page,
                                        msg_page,
                                    } => {
                                        vcpu_plugin
                                            .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
                                    }
                                    VcpuExit::Hlt => break,
                                    VcpuExit::Shutdown => break,
                                    VcpuExit::InternalError => {
                                        error!("vcpu {} has internal error", cpu_id);
                                        break;
                                    }
                                    r => warn!("unexpected vcpu exit: {:?}", r),
                                },
                                Err(e) => match e.errno() {
                                    // -EINTR means we were kicked by SIGRTMIN
                                    // (or immediate exit); check for a pause
                                    // request below.
                                    EINTR => interrupted_by_signal = true,
                                    EAGAIN => {}
                                    _ => {
                                        error!("vcpu hit unknown error: {}", e);
                                        break;
                                    }
                                },
                            }
                            if kill_signaled.load(Ordering::SeqCst) {
                                break;
                            }

                            // Only handle the pause request if kvm reported that it was
                            // interrupted by a signal.  This helps to ensure that KVM has had a chance
                            // to finish emulating any IO that may have immediately happened.
                            // If we eagerly check pre_run() then any IO that we
                            // just reported to the plugin won't have been processed yet by KVM.
                            // Not eagerly calling pre_run() also helps to reduce
                            // any overhead from checking if a pause request is pending.
                            // The assumption is that pause requests aren't common
                            // or frequent so it's better to optimize for the non-pause execution paths.
                            if interrupted_by_signal {
                                if use_kvm_signals {
                                    clear_signal(SIGRTMIN() + 0)
                                        .expect("failed to clear pending signal");
                                } else {
                                    vcpu.set_immediate_exit(false);
                                }

                                if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
                                    error!("failed to process pause on vcpu {}: {}", cpu_id, e);
                                    break;
                                }
                            }
                        }
                    }
                    vcpu_exit_evt
                        .signal()
                        .expect("failed to signal vcpu exit event");
                })
                .context("error spawning vcpu thread")?,
        );
    }
    Ok(())
}
466 
/// Tokens identifying the descriptors registered with the main wait context.
#[derive(EventToken)]
enum Token {
    /// The exit event was signaled (e.g. by a VCPU thread).
    Exit,
    /// SIGCHLD arrived on the signalfd — a child process changed state.
    ChildSignal,
    /// The stderr capture pipe from the plugin/minijail is readable.
    Stderr,
    /// A plugin socket at `index` within `plugin.sockets()` is ready.
    Plugin { index: usize },
}
474 
475 /// Run a VM with a plugin process specified by `cfg`.
476 ///
477 /// Not every field of `cfg` will be used. In particular, most field that pertain to a specific
478 /// device are ignored because the plugin is responsible for emulating hardware.
run_config(cfg: Config) -> Result<()>479 pub fn run_config(cfg: Config) -> Result<()> {
480     info!("crosvm starting plugin process");
481 
482     // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
483     // before any jailed devices have been spawned, so that we can catch any of them that fail very
484     // quickly.
485     let sigchld_fd = SignalFd::new(SIGCHLD).context("failed to create signalfd")?;
486 
487     // Create a pipe to capture error messages from plugin and minijail.
488     let (mut stderr_rd, stderr_wr) = pipe(true).context("failed to create stderr pipe")?;
489     add_fd_flags(stderr_rd.as_raw_descriptor(), O_NONBLOCK)
490         .context("error marking stderr nonblocking")?;
491 
492     let jail = if let Some(jail_config) = &cfg.jail_config {
493         if jail_config.seccomp_policy_dir.is_none() {
494             bail!("plugin requires seccomp policy file specified.");
495         }
496 
497         let mut config = SandboxConfig::new(jail_config, "plugin");
498         config.bind_mounts = true;
499         let uid_map = format!("0 {} 1", geteuid());
500         let gid_map = format!("0 {} 1", getegid());
501         let gid_map = if cfg.plugin_gid_maps.len() > 0 {
502             gid_map
503                 + &cfg
504                     .plugin_gid_maps
505                     .into_iter()
506                     .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count))
507                     .collect::<String>()
508         } else {
509             gid_map
510         };
511         config.ugid_map = Some((&uid_map, &gid_map));
512 
513         let root_path = cfg.plugin_root.as_ref().unwrap_or(&jail_config.pivot_root);
514         let mut jail = create_sandbox_minijail(root_path, MAX_OPEN_FILES, &config)
515             .context("create plugin sandbox")?;
516 
517         // Because we requested to "run as init", minijail will not mount /proc for us even though
518         // plugin will be running in its own PID namespace, so we have to mount it ourselves.
519         mount_proc(&mut jail).context("mount proc")?;
520 
521         // Mount minimal set of devices (full, zero, urandom, etc). We can not use
522         // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
523         let device_names = ["full", "null", "urandom", "zero"];
524         for name in &device_names {
525             let device = Path::new("/dev").join(name);
526             jail.mount_bind(&device, &device, true)
527                 .context("failed to mount dev")?;
528         }
529 
530         for bind_mount in &cfg.plugin_mounts {
531             jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
532                 .with_context(|| {
533                     format!(
534                         "failed to bind mount {} -> {} as {} ",
535                         bind_mount.src.display(),
536                         bind_mount.dst.display(),
537                         if bind_mount.writable {
538                             "writable"
539                         } else {
540                             "read only"
541                         }
542                     )
543                 })?;
544         }
545 
546         Some(jail)
547     } else {
548         None
549     };
550 
551     let mut tap_interfaces: Vec<Tap> = Vec::new();
552     for net_params in cfg.net {
553         if net_params.vhost_net {
554             bail!("vhost-net not supported with plugin");
555         }
556 
557         match net_params.mode {
558             NetParametersMode::RawConfig {
559                 host_ip,
560                 netmask,
561                 mac,
562             } => {
563                 let tap = Tap::new(false, false).context("error opening tap device")?;
564                 tap.set_ip_addr(host_ip).context("error setting tap ip")?;
565                 tap.set_netmask(netmask)
566                     .context("error setting tap netmask")?;
567                 tap.set_mac_address(mac)
568                     .context("error setting tap mac address")?;
569 
570                 tap.enable().context("error enabling tap device")?;
571                 tap_interfaces.push(tap);
572             }
573             NetParametersMode::TapName { tap_name, mac } => {
574                 let tap = Tap::new_with_name(tap_name.as_bytes(), true, false)
575                     .context("failed to create tap device from name")?;
576                 if let Some(mac) = mac {
577                     tap.set_mac_address(mac)
578                         .context("error setting tap mac addres")?;
579                 }
580                 tap_interfaces.push(tap);
581             }
582             NetParametersMode::TapFd { tap_fd, mac } => {
583                 // Safe because we ensure that we get a unique handle to the fd.
584                 let tap = unsafe {
585                     Tap::from_raw_descriptor(
586                         validate_raw_descriptor(tap_fd).context("failed to validate raw tap fd")?,
587                     )
588                     .context("failed to create tap device from raw fd")?
589                 };
590                 if let Some(mac) = mac {
591                     tap.set_mac_address(mac)
592                         .context("error setting tap mac addres")?;
593                 }
594                 tap_interfaces.push(tap);
595             }
596         }
597     }
598 
599     let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();
600 
601     let plugin_path = match cfg.executable_path {
602         Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
603         _ => panic!("Executable was not a plugin"),
604     };
605     let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32;
606     let mem = GuestMemory::new(&[]).unwrap();
607     let mut mem_policy = MemoryPolicy::empty();
608     if cfg.hugepages {
609         mem_policy |= MemoryPolicy::USE_HUGEPAGES;
610     }
611     mem.set_memory_policy(mem_policy);
612 
613     let kvm_device_path = if let Some(HypervisorKind::Kvm { device }) = &cfg.hypervisor {
614         device.as_deref()
615     } else {
616         None
617     };
618 
619     let kvm_device_path = kvm_device_path.unwrap_or(Path::new("/dev/kvm"));
620     let kvm = Kvm::new_with_path(kvm_device_path).context("error creating Kvm")?;
621     let mut vm = Vm::new(&kvm, mem).context("error creating vm")?;
622     vm.create_irq_chip()
623         .context("failed to create kvm irqchip")?;
624     vm.create_pit().context("failed to create kvm PIT")?;
625 
626     let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail, stderr_wr)?;
627     // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
628     // we can drop all our capabilities in case we had any.
629     drop_capabilities().context("failed to drop process capabilities")?;
630 
631     let mut res = Ok(());
632     // If Some, we will exit after enough time is passed to shutdown cleanly.
633     let mut dying_instant: Option<Instant> = None;
634     let duration_to_die = Duration::from_millis(1000);
635 
636     let exit_evt = Event::new().context("failed to create event")?;
637     let kill_signaled = Arc::new(AtomicBool::new(false));
638     let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);
639 
640     let wait_ctx = WaitContext::build_with(&[
641         (&exit_evt, Token::Exit),
642         (&sigchld_fd, Token::ChildSignal),
643         (&stderr_rd, Token::Stderr),
644     ])
645     .context("failed to add control descriptors to wait context")?;
646 
647     let mut sockets_to_drop = Vec::new();
648     let mut redo_wait_ctx_sockets = true;
649     // In this loop, make every attempt to not return early. If an error is encountered, set `res`
650     // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
651     // If the plugin cannot be signaled because it is dead of `signal_kill` failed, simply break
652     // from the poll loop so that the VCPU threads can be cleaned up.
653     'wait: loop {
654         // After we have waited long enough, it's time to give up and exit.
655         if dying_instant
656             .map(|i| i.elapsed() >= duration_to_die)
657             .unwrap_or(false)
658         {
659             break;
660         }
661 
662         if redo_wait_ctx_sockets {
663             for (index, socket) in plugin.sockets().iter().enumerate() {
664                 wait_ctx
665                     .add(socket, Token::Plugin { index })
666                     .context("failed to add plugin sockets to wait context")?;
667             }
668         }
669 
670         let plugin_socket_count = plugin.sockets().len();
671         let events = {
672             let poll_res = match dying_instant {
673                 Some(inst) => wait_ctx.wait_timeout(duration_to_die - inst.elapsed()),
674                 None => wait_ctx.wait(),
675             };
676             match poll_res {
677                 Ok(v) => v,
678                 Err(e) => {
                    // Polling no longer works; record the error (unless an
                    // earlier one is already pending in `res`) and break out to
                    // begin cleanup.
                    if res.is_ok() {
                        res = Err(e).context("failed to poll all FDs");
                    }
                    break;
                }
            }
        };

        // If the plugin's stderr pipe hung up, stop watching it. Any bytes
        // still buffered in the pipe are drained by the `Token::Stderr` arm
        // below before the descriptor goes quiet.
        for event in events.iter().filter(|e| e.is_hungup) {
            if let Token::Stderr = event.token {
                let _ = wait_ctx.delete(&stderr_rd);
            }
        }

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::Exit => {
                    // No need to check the exit event if we are already doing cleanup.
                    let _ = wait_ctx.delete(&exit_evt);
                    // Start the dying countdown if it is not already running.
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    // Only the first error encountered is preserved in `res`.
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on exit event");
                    }
                }
                Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop.
                    loop {
                        match sigchld_fd.read() {
                            Ok(Some(siginfo)) => {
                                // If the plugin process has ended, there is no need to continue
                                // processing plugin connections, so we break early.
                                // (`'wait` labels the outer event loop, declared above this
                                // chunk.)
                                if siginfo.ssi_pid == plugin.pid() as u32 {
                                    break 'wait;
                                }
                                // Because SIGCHLD is not expected from anything other than the
                                // plugin process, report it as an error.
                                if res.is_ok() {
                                    res = Err(anyhow!(
                                        "process {} died with signal {}, status {}, and code {}",
                                        siginfo.ssi_pid,
                                        siginfo.ssi_signo,
                                        siginfo.ssi_status,
                                        siginfo.ssi_code,
                                    ));
                                }
                            }
                            Ok(None) => break, // No more signals to read.
                            Err(e) => {
                                // Something really must be messed up for this to happen, continue
                                // processing connections for a limited time.
                                if res.is_ok() {
                                    res = Err(e).context("failed to read signal fd");
                                }
                                break;
                            }
                        }
                    }
                    // As we only spawn the plugin process, getting a SIGCHLD can only mean
                    // something went wrong. Begin the dying countdown and ask the plugin to
                    // terminate.
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on SIGCHLD");
                    }
                }
                // Forward the plugin's stderr to our own error log, draining the pipe
                // until a read would block (the pipe is presumably nonblocking —
                // confirm against where `stderr_rd` is created above this chunk).
                Token::Stderr => loop {
                    let mut buf = [0u8; 4096];
                    match stderr_rd.read(&mut buf) {
                        Ok(len) => {
                            // Log line by line; invalid UTF-8 is replaced lossily.
                            for l in String::from_utf8_lossy(&buf[0..len]).lines() {
                                error!("minijail/plugin: {}", l);
                            }
                        }
                        Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {
                            break;
                        }
                        Err(e) => {
                            error!("failed reading from stderr: {}", e);
                            break;
                        }
                    }
                },
                // A request arrived on one of the plugin's control sockets.
                Token::Plugin { index } => {
                    match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
                    {
                        Ok(_) => {}
                        // A HUP is an expected event for a socket, so don't bother warning about
                        // it.
                        Err(CommError::PluginSocketHup) => sockets_to_drop.push(index),
                        // Only one connection out of potentially many is broken. Drop it, but don't
                        // start cleaning up. Because the error isn't returned, we will warn about
                        // it here.
                        Err(e) => {
                            warn!("error handling plugin socket: {}", e);
                            sockets_to_drop.push(index);
                        }
                    }
                }
            }
        }

        // Once the plugin reports it has started (and we are not already dying),
        // spin up the VCPU threads exactly once (`vcpu_handles` is empty only
        // before the first successful spawn).
        if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
            let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
                None => None,
                Some(cgroup_path) => {
                    // Move main process to cgroup_path by writing our PID into its
                    // `tasks` file; the open handle is then passed on to `run_vcpus`.
                    let mut f = File::create(&cgroup_path.join("tasks"))?;
                    f.write_all(std::process::id().to_string().as_bytes())?;
                    Some(f)
                }
            };

            // NOTE(review): this `res` deliberately shadows the outer error
            // accumulator — a vcpu startup failure is logged and triggers the
            // dying countdown, but is not propagated through the outer `res`.
            let res = run_vcpus(
                &kvm,
                &vm,
                &plugin,
                vcpu_count,
                &kill_signaled,
                &exit_evt,
                &mut vcpu_handles,
                vcpu_cgroup_tasks_file,
            );
            if let Err(e) = res {
                dying_instant.get_or_insert(Instant::now());
                error!("failed to start vcpus: {}", e);
            }
        }

        // The socket set must be re-registered with the wait context whenever a
        // socket was dropped or the plugin's socket count changed this iteration.
        redo_wait_ctx_sockets =
            !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;

        // Cleanup all of the sockets that we have determined were disconnected or suffered some
        // other error.
        plugin.drop_sockets(&mut sockets_to_drop);
        sockets_to_drop.clear();

        if redo_wait_ctx_sockets {
            // Deregister everything here; re-registration happens at the top of
            // the outer loop (above this chunk).
            for socket in plugin.sockets() {
                let _ = wait_ctx.delete(socket);
            }
        }
    }

    // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
    kill_signaled.store(true, Ordering::SeqCst);
    // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests
    // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
    // blocked connections.
    plugin
        .signal_kill()
        .context("error sending kill signal to plugin on cleanup")?;
    for handle in vcpu_handles {
        // SIGRTMIN() + 0 presumably matches the real-time signal whose handler the
        // vcpu threads registered to interrupt a blocked KVM_RUN — confirm against
        // the vcpu setup code above this chunk.
        match handle.kill(SIGRTMIN() + 0) {
            Ok(_) => {
                if let Err(e) = handle.join() {
                    error!("failed to join vcpu thread: {:?}", e);
                }
            }
            Err(e) => error!("failed to kill vcpu thread: {}", e),
        }
    }

    // Translate the plugin's final status — and any error accumulated in `res`
    // during the event loop — into this function's result.
    match plugin.try_wait() {
        // The plugin has run out of time by now.
        Ok(ProcessStatus::Running) => Err(anyhow!("plugin did not exit within timeout")),
        // Return an error discovered earlier in this function.
        Ok(ProcessStatus::Success) => res.map_err(anyhow::Error::msg),
        Ok(ProcessStatus::Fail(code)) => Err(anyhow!("plugin exited with error: {}", code)),
        Ok(ProcessStatus::Signal(code)) => Err(anyhow!("plugin exited with signal {}", code)),
        Err(e) => Err(anyhow!("error waiting for plugin to exit: {}", e)),
    }
}
853