1 // Copyright 2018 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 mod process;
6 mod vcpu;
7
8 use std::fmt::{self, Display};
9 use std::fs::File;
10 use std::io;
11 use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd};
12 use std::os::unix::net::UnixDatagram;
13 use std::path::Path;
14 use std::result;
15 use std::sync::atomic::{AtomicBool, Ordering};
16 use std::sync::{Arc, Barrier};
17 use std::thread;
18 use std::time::{Duration, Instant};
19
20 use libc::{
21 c_int, c_ulong, fcntl, ioctl, socketpair, AF_UNIX, EAGAIN, EBADF, EDEADLK, EEXIST, EINTR,
22 EINVAL, ENOENT, EOVERFLOW, EPERM, FIOCLEX, F_SETPIPE_SZ, MS_NODEV, MS_NOEXEC, MS_NOSUID,
23 SIGCHLD, SOCK_SEQPACKET,
24 };
25
26 use protobuf::ProtobufError;
27 use remain::sorted;
28
29 use io_jail::{self, Minijail};
30 use kvm::{Datamatch, IoeventAddress, Kvm, Vcpu, VcpuExit, Vm};
31 use net_util::{Error as TapError, Tap, TapT};
32 use sys_util::{
33 block_signal, clear_signal, drop_capabilities, error, getegid, geteuid, info, pipe,
34 register_signal_handler, validate_raw_fd, warn, Error as SysError, EventFd, GuestMemory,
35 Killable, MmapError, PollContext, PollToken, Result as SysResult, SignalFd, SignalFdError,
36 SIGRTMIN,
37 };
38
39 use self::process::*;
40 use self::vcpu::*;
41 use crate::{Config, Executable};
42
/// Byte limit for a single datagram, going by the name.
///
/// NOTE(review): this constant is not referenced in this part of the file; presumably it is used
/// by the `process`/`vcpu` submodules when sizing plugin socket buffers -- confirm there.
const MAX_DATAGRAM_SIZE: usize = 4096;
/// Capacity in bytes (256 KiB) that `new_pipe_pair` requests for each vcpu pipe.
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
45
46 /// An error that occurs during the lifetime of a plugin process.
47 #[sorted]
48 pub enum Error {
49 CloneEventFd(SysError),
50 CloneVcpuPipe(io::Error),
51 CreateEventFd(SysError),
52 CreateIrqChip(SysError),
53 CreateJail(io_jail::Error),
54 CreateKvm(SysError),
55 CreateMainSocket(SysError),
56 CreatePIT(SysError),
57 CreatePollContext(SysError),
58 CreateSignalFd(SignalFdError),
59 CreateSocketPair(io::Error),
60 CreateTapFd(TapError),
61 CreateVcpu(SysError),
62 CreateVcpuSocket(SysError),
63 CreateVm(SysError),
64 DecodeRequest(ProtobufError),
65 DropCapabilities(SysError),
66 EncodeResponse(ProtobufError),
67 Mount(io_jail::Error),
68 MountDev(io_jail::Error),
69 MountLib(io_jail::Error),
70 MountLib64(io_jail::Error),
71 MountPlugin(io_jail::Error),
72 MountPluginLib(io_jail::Error),
73 MountRoot(io_jail::Error),
74 NoRootDir,
75 ParsePivotRoot(io_jail::Error),
76 ParseSeccomp(io_jail::Error),
77 PluginFailed(i32),
78 PluginKill(SysError),
79 PluginKilled(i32),
80 PluginRunJail(io_jail::Error),
81 PluginSocketHup,
82 PluginSocketPoll(SysError),
83 PluginSocketRecv(SysError),
84 PluginSocketSend(SysError),
85 PluginSpawn(io::Error),
86 PluginTimeout,
87 PluginWait(SysError),
88 Poll(SysError),
89 PollContextAdd(SysError),
90 RootNotAbsolute,
91 RootNotDir,
92 SetGidMap(io_jail::Error),
93 SetUidMap(io_jail::Error),
94 SigChild {
95 pid: u32,
96 signo: u32,
97 status: i32,
98 code: i32,
99 },
100 SignalFd(SignalFdError),
101 SpawnVcpu(io::Error),
102 TapEnable(TapError),
103 TapOpen(TapError),
104 TapSetIp(TapError),
105 TapSetMacAddress(TapError),
106 TapSetNetmask(TapError),
107 ValidateTapFd(SysError),
108 }
109
110 impl Display for Error {
111 #[remain::check]
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result112 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
113 use self::Error::*;
114
115 #[sorted]
116 match self {
117 CloneEventFd(e) => write!(f, "failed to clone eventfd: {}", e),
118 CloneVcpuPipe(e) => write!(f, "failed to clone vcpu pipe: {}", e),
119 CreateEventFd(e) => write!(f, "failed to create eventfd: {}", e),
120 CreateIrqChip(e) => write!(f, "failed to create kvm irqchip: {}", e),
121 CreateJail(e) => write!(f, "failed to create jail: {}", e),
122 CreateKvm(e) => write!(f, "error creating Kvm: {}", e),
123 CreateMainSocket(e) => write!(f, "error creating main request socket: {}", e),
124 CreatePIT(e) => write!(f, "failed to create kvm PIT: {}", e),
125 CreatePollContext(e) => write!(f, "failed to create poll context: {}", e),
126 CreateSignalFd(e) => write!(f, "failed to create signalfd: {}", e),
127 CreateSocketPair(e) => write!(f, "failed to create socket pair: {}", e),
128 CreateTapFd(e) => write!(f, "failed to create tap device from raw fd: {}", e),
129 CreateVcpu(e) => write!(f, "error creating vcpu: {}", e),
130 CreateVcpuSocket(e) => write!(f, "error creating vcpu request socket: {}", e),
131 CreateVm(e) => write!(f, "error creating vm: {}", e),
132 DecodeRequest(e) => write!(f, "failed to decode plugin request: {}", e),
133 DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e),
134 EncodeResponse(e) => write!(f, "failed to encode plugin response: {}", e),
135 Mount(e) | MountDev(e) | MountLib(e) | MountLib64(e) | MountPlugin(e)
136 | MountPluginLib(e) | MountRoot(e) => write!(f, "failed to mount: {}", e),
137 NoRootDir => write!(f, "no root directory for jailed process to pivot root into"),
138 ParsePivotRoot(e) => write!(f, "failed to set jail pivot root: {}", e),
139 ParseSeccomp(e) => write!(f, "failed to parse jail seccomp filter: {}", e),
140 PluginFailed(e) => write!(f, "plugin exited with error: {}", e),
141 PluginKill(e) => write!(f, "error sending kill signal to plugin: {}", e),
142 PluginKilled(e) => write!(f, "plugin exited with signal {}", e),
143 PluginRunJail(e) => write!(f, "failed to run jail: {}", e),
144 PluginSocketHup => write!(f, "plugin request socket has been hung up"),
145 PluginSocketPoll(e) => write!(f, "failed to poll plugin request sockets: {}", e),
146 PluginSocketRecv(e) => write!(f, "failed to recv from plugin request socket: {}", e),
147 PluginSocketSend(e) => write!(f, "failed to send to plugin request socket: {}", e),
148 PluginSpawn(e) => write!(f, "failed to spawn plugin: {}", e),
149 PluginTimeout => write!(f, "plugin did not exit within timeout"),
150 PluginWait(e) => write!(f, "error waiting for plugin to exit: {}", e),
151 Poll(e) => write!(f, "failed to poll all FDs: {}", e),
152 PollContextAdd(e) => write!(f, "failed to add fd to poll context: {}", e),
153 RootNotAbsolute => write!(f, "path to the root directory must be absolute"),
154 RootNotDir => write!(f, "specified root directory is not a directory"),
155 SetGidMap(e) => write!(f, "failed to set gidmap for jail: {}", e),
156 SetUidMap(e) => write!(f, "failed to set uidmap for jail: {}", e),
157 SigChild {
158 pid,
159 signo,
160 status,
161 code,
162 } => write!(
163 f,
164 "process {} died with signal {}, status {}, and code {}",
165 pid, signo, status, code
166 ),
167 SignalFd(e) => write!(f, "failed to read signal fd: {}", e),
168 SpawnVcpu(e) => write!(f, "error spawning vcpu thread: {}", e),
169 TapEnable(e) => write!(f, "error enabling tap device: {}", e),
170 TapOpen(e) => write!(f, "error opening tap device: {}", e),
171 TapSetIp(e) => write!(f, "error setting tap ip: {}", e),
172 TapSetMacAddress(e) => write!(f, "error setting tap mac address: {}", e),
173 TapSetNetmask(e) => write!(f, "error setting tap netmask: {}", e),
174 ValidateTapFd(e) => write!(f, "failed to validate raw tap fd: {}", e),
175 }
176 }
177 }
178
179 type Result<T> = result::Result<T, Error>;
180
/// Converts any owner of a raw file descriptor into a plain `File`.
///
/// Ownership of the descriptor is transferred: `f` is consumed and the returned `File` closes the
/// descriptor when it is dropped.
fn downcast_file<F: IntoRawFd>(f: F) -> File {
    // SAFETY: `into_raw_fd` hands over exclusive ownership of an open descriptor, so wrapping it
    // in a `File` cannot create a second owner.
    unsafe { File::from_raw_fd(f.into_raw_fd()) }
}
184
new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)>185 fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
186 let mut fds = [0, 0];
187 unsafe {
188 let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
189 if ret == 0 {
190 ioctl(fds[0], FIOCLEX);
191 Ok((
192 UnixDatagram::from_raw_fd(fds[0]),
193 UnixDatagram::from_raw_fd(fds[1]),
194 ))
195 } else {
196 Err(SysError::last())
197 }
198 }
199 }
200
/// The four ends of the two pipes that `new_pipe_pair` creates for one vcpu.
///
/// One pipe (`to_crosvm` in `new_pipe_pair`) provides `crosvm_read` and `plugin_write`; the other
/// (`to_plugin`) provides `plugin_read` and `crosvm_write`.
struct VcpuPipe {
    /// First end of the `to_crosvm` pipe.
    crosvm_read: File,
    /// Second end of the `to_crosvm` pipe.
    plugin_write: File,
    /// First end of the `to_plugin` pipe.
    plugin_read: File,
    /// Second end of the `to_plugin` pipe.
    crosvm_write: File,
}
207
new_pipe_pair() -> SysResult<VcpuPipe>208 fn new_pipe_pair() -> SysResult<VcpuPipe> {
209 let to_crosvm = pipe(true)?;
210 let to_plugin = pipe(true)?;
211 // Increasing the pipe size can be a nice-to-have to make sure that
212 // messages get across atomically (and made sure that writes don't block),
213 // though it's not necessary a hard requirement for things to work.
214 let flags = unsafe {
215 fcntl(
216 to_crosvm.0.as_raw_fd(),
217 F_SETPIPE_SZ,
218 MAX_VCPU_DATAGRAM_SIZE as c_int,
219 )
220 };
221 if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
222 warn!(
223 "Failed to adjust size of crosvm pipe (result {}): {}",
224 flags,
225 SysError::last()
226 );
227 }
228 let flags = unsafe {
229 fcntl(
230 to_plugin.0.as_raw_fd(),
231 F_SETPIPE_SZ,
232 MAX_VCPU_DATAGRAM_SIZE as c_int,
233 )
234 };
235 if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
236 warn!(
237 "Failed to adjust size of plugin pipe (result {}): {}",
238 flags,
239 SysError::last()
240 );
241 }
242 Ok(VcpuPipe {
243 crosvm_read: to_crosvm.0,
244 plugin_write: to_crosvm.1,
245 plugin_read: to_plugin.0,
246 crosvm_write: to_plugin.1,
247 })
248 }
249
proto_to_sys_err(e: ProtobufError) -> SysError250 fn proto_to_sys_err(e: ProtobufError) -> SysError {
251 match e {
252 ProtobufError::IoError(e) => SysError::new(e.raw_os_error().unwrap_or(EINVAL)),
253 _ => SysError::new(EINVAL),
254 }
255 }
256
io_to_sys_err(e: io::Error) -> SysError257 fn io_to_sys_err(e: io::Error) -> SysError {
258 SysError::new(e.raw_os_error().unwrap_or(EINVAL))
259 }
260
mmap_to_sys_err(e: MmapError) -> SysError261 fn mmap_to_sys_err(e: MmapError) -> SysError {
262 match e {
263 MmapError::SystemCallFailed(e) => e,
264 _ => SysError::new(EINVAL),
265 }
266 }
267
create_plugin_jail(root: &Path, seccomp_policy: &Path) -> Result<Minijail>268 fn create_plugin_jail(root: &Path, seccomp_policy: &Path) -> Result<Minijail> {
269 // All child jails run in a new user namespace without any users mapped,
270 // they run as nobody unless otherwise configured.
271 let mut j = Minijail::new().map_err(Error::CreateJail)?;
272 j.namespace_pids();
273 j.namespace_user();
274 j.uidmap(&format!("0 {0} 1", geteuid()))
275 .map_err(Error::SetUidMap)?;
276 j.gidmap(&format!("0 {0} 1", getegid()))
277 .map_err(Error::SetGidMap)?;
278 j.namespace_user_disable_setgroups();
279 // Don't need any capabilities.
280 j.use_caps(0);
281 // Create a new mount namespace with an empty root FS.
282 j.namespace_vfs();
283 j.enter_pivot_root(root).map_err(Error::ParsePivotRoot)?;
284 // Run in an empty network namespace.
285 j.namespace_net();
286 j.no_new_privs();
287 // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill
288 // the entire plugin process if a worker thread commits a seccomp violation.
289 j.set_seccomp_filter_tsync();
290 #[cfg(debug_assertions)]
291 j.log_seccomp_filter_failures();
292 j.parse_seccomp_filters(seccomp_policy)
293 .map_err(Error::ParseSeccomp)?;
294 j.use_seccomp_filter();
295 // Don't do init setup.
296 j.run_as_init();
297
298 // Create a tmpfs in the plugin's root directory so that we can bind mount it's executable
299 // file into it. The size=67108864 is size=64*1024*1024 or size=64MB.
300 j.mount_with_data(
301 Path::new("none"),
302 Path::new("/"),
303 "tmpfs",
304 (MS_NOSUID | MS_NODEV | MS_NOEXEC) as usize,
305 "size=67108864",
306 )
307 .map_err(Error::MountRoot)?;
308
309 Ok(j)
310 }
311
312 /// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
313 /// request.
314 ///
315 /// Each such object has an ID associated with it that exists in an ID space shared by every variant
316 /// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
317 /// common destroy method.
318 ///
319
320 /// In addition to the destory method, each object may have methods specific to its variant type.
321 /// These variant methods must be done by matching the variant to the expected type for that method.
322 /// For example, getting the dirty log from a `Memory` object starting with an ID:
323 ///
324 /// ```
325 /// match objects.get(&request_id) {
326 /// Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..])
327 /// _ => return Err(SysError::new(ENOENT)),
328 /// }
329 /// ```
330 enum PluginObject {
331 IoEvent {
332 evt: EventFd,
333 addr: IoeventAddress,
334 length: u32,
335 datamatch: u64,
336 },
337 Memory {
338 slot: u32,
339 length: usize,
340 },
341 IrqEvent {
342 irq_id: u32,
343 evt: EventFd,
344 },
345 }
346
347 impl PluginObject {
destroy(self, vm: &mut Vm) -> SysResult<()>348 fn destroy(self, vm: &mut Vm) -> SysResult<()> {
349 match self {
350 PluginObject::IoEvent {
351 evt,
352 addr,
353 length,
354 datamatch,
355 } => match length {
356 0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
357 1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
358 2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
359 4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
360 8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch as u64))),
361 _ => Err(SysError::new(EINVAL)),
362 },
363 PluginObject::Memory { slot, .. } => vm.remove_device_memory(slot).and(Ok(())),
364 PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
365 }
366 }
367 }
368
run_vcpus( kvm: &Kvm, vm: &Vm, plugin: &Process, vcpu_count: u32, kill_signaled: &Arc<AtomicBool>, exit_evt: &EventFd, vcpu_handles: &mut Vec<thread::JoinHandle<()>>, ) -> Result<()>369 pub fn run_vcpus(
370 kvm: &Kvm,
371 vm: &Vm,
372 plugin: &Process,
373 vcpu_count: u32,
374 kill_signaled: &Arc<AtomicBool>,
375 exit_evt: &EventFd,
376 vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
377 ) -> Result<()> {
378 let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_count) as usize));
379 for cpu_id in 0..vcpu_count {
380 let kill_signaled = kill_signaled.clone();
381 let vcpu_thread_barrier = vcpu_thread_barrier.clone();
382 let vcpu_exit_evt = exit_evt.try_clone().map_err(Error::CloneEventFd)?;
383 let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
384 let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).map_err(Error::CreateVcpu)?;
385
386 vcpu_handles.push(
387 thread::Builder::new()
388 .name(format!("crosvm_vcpu{}", cpu_id))
389 .spawn(move || {
390 unsafe {
391 extern "C" fn handle_signal() {}
392 // Our signal handler does nothing and is trivially async signal safe.
393 // We need to install this signal handler even though we do block
394 // the signal below, to ensure that this signal will interrupt
395 // execution of KVM_RUN (this is implementation issue).
396 register_signal_handler(SIGRTMIN() + 0, handle_signal)
397 .expect("failed to register vcpu signal handler");
398 }
399
400 // We do not really want the signal handler to run...
401 block_signal(SIGRTMIN() + 0).expect("failed to block signal");
402 // Tell KVM to not block anything when entering kvm run
403 // because we will be using first RT signal to kick the VCPU.
404 vcpu.set_signal_mask(&[])
405 .expect("failed to set up KVM VCPU signal mask");
406
407 let res = vcpu_plugin.init(&vcpu);
408 vcpu_thread_barrier.wait();
409 if let Err(e) = res {
410 error!("failed to initialize vcpu {}: {}", cpu_id, e);
411 } else {
412 loop {
413 let mut interrupted_by_signal = false;
414 let run_res = vcpu.run();
415 match run_res {
416 Ok(run) => match run {
417 VcpuExit::IoIn { port, mut size } => {
418 let mut data = [0; 256];
419 if size > data.len() {
420 error!("unsupported IoIn size of {} bytes", size);
421 size = data.len();
422 }
423 vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
424 if let Err(e) = vcpu.set_data(&data[..size]) {
425 error!("failed to set return data for IoIn: {}", e);
426 }
427 }
428 VcpuExit::IoOut {
429 port,
430 mut size,
431 data,
432 } => {
433 if size > data.len() {
434 error!("unsupported IoOut size of {} bytes", size);
435 size = data.len();
436 }
437 vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
438 }
439 VcpuExit::MmioRead { address, size } => {
440 let mut data = [0; 8];
441 vcpu_plugin.mmio_read(
442 address as u64,
443 &mut data[..size],
444 &vcpu,
445 );
446 // Setting data for mmio can not fail.
447 let _ = vcpu.set_data(&data[..size]);
448 }
449 VcpuExit::MmioWrite {
450 address,
451 size,
452 data,
453 } => {
454 vcpu_plugin.mmio_write(
455 address as u64,
456 &data[..size],
457 &vcpu,
458 );
459 }
460 VcpuExit::Hlt => break,
461 VcpuExit::Shutdown => break,
462 VcpuExit::InternalError => {
463 error!("vcpu {} has internal error", cpu_id);
464 break;
465 }
466 r => warn!("unexpected vcpu exit: {:?}", r),
467 },
468 Err(e) => match e.errno() {
469 EINTR => interrupted_by_signal = true,
470 EAGAIN => {}
471 _ => {
472 error!("vcpu hit unknown error: {}", e);
473 break;
474 }
475 },
476 }
477 if kill_signaled.load(Ordering::SeqCst) {
478 break;
479 }
480
481 // Try to clear the signal that we use to kick VCPU if it is
482 // pending before attempting to handle pause requests.
483 if interrupted_by_signal {
484 clear_signal(SIGRTMIN() + 0)
485 .expect("failed to clear pending signal");
486 }
487
488 if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
489 error!("failed to process pause on vcpu {}: {}", cpu_id, e);
490 break;
491 }
492 }
493 }
494 vcpu_exit_evt
495 .write(1)
496 .expect("failed to signal vcpu exit eventfd");
497 })
498 .map_err(Error::SpawnVcpu)?,
499 );
500 }
501 Ok(())
502 }
503
504 #[derive(PollToken)]
505 enum Token {
506 Exit,
507 ChildSignal,
508 Plugin { index: usize },
509 }
510
511 /// Run a VM with a plugin process specified by `cfg`.
512 ///
513 /// Not every field of `cfg` will be used. In particular, most field that pertain to a specific
514 /// device are ignored because the plugin is responsible for emulating hardware.
run_config(cfg: Config) -> Result<()>515 pub fn run_config(cfg: Config) -> Result<()> {
516 info!("crosvm starting plugin process");
517
518 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
519 // before any jailed devices have been spawned, so that we can catch any of them that fail very
520 // quickly.
521 let sigchld_fd = SignalFd::new(SIGCHLD).map_err(Error::CreateSignalFd)?;
522
523 let jail = if cfg.sandbox {
524 // An empty directory for jailed plugin pivot root.
525 let root_path = match &cfg.plugin_root {
526 Some(dir) => dir,
527 None => Path::new("/var/empty"),
528 };
529
530 if root_path.is_relative() {
531 return Err(Error::RootNotAbsolute);
532 }
533
534 if !root_path.exists() {
535 return Err(Error::NoRootDir);
536 }
537
538 if !root_path.is_dir() {
539 return Err(Error::RootNotDir);
540 }
541
542 let policy_path = cfg.seccomp_policy_dir.join("plugin.policy");
543 let mut jail = create_plugin_jail(root_path, &policy_path)?;
544
545 // Update gid map of the jail if caller provided supplemental groups.
546 if !cfg.plugin_gid_maps.is_empty() {
547 let map = format!("0 {} 1", getegid())
548 + &cfg
549 .plugin_gid_maps
550 .into_iter()
551 .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count))
552 .collect::<String>();
553 jail.gidmap(&map).map_err(Error::SetGidMap)?;
554 }
555
556 // Mount minimal set of devices (full, zero, urandom, etc). We can not use
557 // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
558 let device_names = ["full", "null", "urandom", "zero"];
559 for name in &device_names {
560 let device = Path::new("/dev").join(&name);
561 jail.mount_bind(&device, &device, true)
562 .map_err(Error::MountDev)?;
563 }
564
565 for bind_mount in &cfg.plugin_mounts {
566 jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
567 .map_err(Error::Mount)?;
568 }
569
570 Some(jail)
571 } else {
572 None
573 };
574
575 let mut tap_interfaces: Vec<Tap> = Vec::new();
576 if let Some(host_ip) = cfg.host_ip {
577 if let Some(netmask) = cfg.netmask {
578 if let Some(mac_address) = cfg.mac_address {
579 let tap = Tap::new(false).map_err(Error::TapOpen)?;
580 tap.set_ip_addr(host_ip).map_err(Error::TapSetIp)?;
581 tap.set_netmask(netmask).map_err(Error::TapSetNetmask)?;
582 tap.set_mac_address(mac_address)
583 .map_err(Error::TapSetMacAddress)?;
584
585 tap.enable().map_err(Error::TapEnable)?;
586 tap_interfaces.push(tap);
587 }
588 }
589 }
590 for tap_fd in cfg.tap_fd {
591 // Safe because we ensure that we get a unique handle to the fd.
592 let tap = unsafe {
593 Tap::from_raw_fd(validate_raw_fd(tap_fd).map_err(Error::ValidateTapFd)?)
594 .map_err(Error::CreateTapFd)?
595 };
596 tap_interfaces.push(tap);
597 }
598
599 let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();
600
601 let plugin_path = match cfg.executable_path {
602 Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
603 _ => panic!("Executable was not a plugin"),
604 };
605 let vcpu_count = cfg.vcpu_count.unwrap_or(1);
606 let mem = GuestMemory::new(&[]).unwrap();
607 let kvm = Kvm::new().map_err(Error::CreateKvm)?;
608 let mut vm = Vm::new(&kvm, mem).map_err(Error::CreateVm)?;
609 vm.create_irq_chip().map_err(Error::CreateIrqChip)?;
610 vm.create_pit().map_err(Error::CreatePIT)?;
611
612 let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail)?;
613 // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
614 // we can drop all our capabilities in case we had any.
615 drop_capabilities().map_err(Error::DropCapabilities)?;
616
617 let mut res = Ok(());
618 // If Some, we will exit after enough time is passed to shutdown cleanly.
619 let mut dying_instant: Option<Instant> = None;
620 let duration_to_die = Duration::from_millis(1000);
621
622 let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
623 let kill_signaled = Arc::new(AtomicBool::new(false));
624 let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);
625
626 let poll_ctx = PollContext::new().map_err(Error::CreatePollContext)?;
627 poll_ctx
628 .add(&exit_evt, Token::Exit)
629 .map_err(Error::PollContextAdd)?;
630 poll_ctx
631 .add(&sigchld_fd, Token::ChildSignal)
632 .map_err(Error::PollContextAdd)?;
633
634 let mut sockets_to_drop = Vec::new();
635 let mut redo_poll_ctx_sockets = true;
636 // In this loop, make every attempt to not return early. If an error is encountered, set `res`
637 // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
638 // If the plugin cannot be signaled because it is dead of `signal_kill` failed, simply break
639 // from the poll loop so that the VCPU threads can be cleaned up.
640 'poll: loop {
641 // After we have waited long enough, it's time to give up and exit.
642 if dying_instant
643 .map(|i| i.elapsed() >= duration_to_die)
644 .unwrap_or(false)
645 {
646 break;
647 }
648
649 if redo_poll_ctx_sockets {
650 for (index, socket) in plugin.sockets().iter().enumerate() {
651 poll_ctx
652 .add(socket, Token::Plugin { index })
653 .map_err(Error::PollContextAdd)?;
654 }
655 }
656
657 let plugin_socket_count = plugin.sockets().len();
658 let events = {
659 let poll_res = match dying_instant {
660 Some(inst) => poll_ctx.wait_timeout(duration_to_die - inst.elapsed()),
661 None => poll_ctx.wait(),
662 };
663 match poll_res {
664 Ok(v) => v,
665 Err(e) => {
666 // Polling no longer works, time to break and cleanup,
667 if res.is_ok() {
668 res = Err(Error::Poll(e));
669 }
670 break;
671 }
672 }
673 };
674 for event in events.iter_readable() {
675 match event.token() {
676 Token::Exit => {
677 // No need to check the exit event if we are already doing cleanup.
678 let _ = poll_ctx.delete(&exit_evt);
679 dying_instant.get_or_insert(Instant::now());
680 let sig_res = plugin.signal_kill();
681 if res.is_ok() && sig_res.is_err() {
682 res = sig_res.map_err(Error::PluginKill);
683 }
684 }
685 Token::ChildSignal => {
686 // Print all available siginfo structs, then exit the loop.
687 loop {
688 match sigchld_fd.read() {
689 Ok(Some(siginfo)) => {
690 // If the plugin process has ended, there is no need to continue
691 // processing plugin connections, so we break early.
692 if siginfo.ssi_pid == plugin.pid() as u32 {
693 break 'poll;
694 }
695 // Because SIGCHLD is not expected from anything other than the
696 // plugin process, report it as an error.
697 if res.is_ok() {
698 res = Err(Error::SigChild {
699 pid: siginfo.ssi_pid,
700 signo: siginfo.ssi_signo,
701 status: siginfo.ssi_status,
702 code: siginfo.ssi_code,
703 })
704 }
705 }
706 Ok(None) => break, // No more signals to read.
707 Err(e) => {
708 // Something really must be messed up for this to happen, continue
709 // processing connections for a limited time.
710 if res.is_ok() {
711 res = Err(Error::SignalFd(e));
712 }
713 break;
714 }
715 }
716 }
717 // As we only spawn the plugin process, getting a SIGCHLD can only mean
718 // something went wrong.
719 dying_instant.get_or_insert(Instant::now());
720 let sig_res = plugin.signal_kill();
721 if res.is_ok() && sig_res.is_err() {
722 res = sig_res.map_err(Error::PluginKill);
723 }
724 }
725 Token::Plugin { index } => {
726 match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
727 {
728 Ok(_) => {}
729 // A HUP is an expected event for a socket, so don't bother warning about
730 // it.
731 Err(Error::PluginSocketHup) => sockets_to_drop.push(index),
732 // Only one connection out of potentially many is broken. Drop it, but don't
733 // start cleaning up. Because the error isn't returned, we will warn about
734 // it here.
735 Err(e) => {
736 warn!("error handling plugin socket: {}", e);
737 sockets_to_drop.push(index);
738 }
739 }
740 }
741 }
742 }
743
744 if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
745 let res = run_vcpus(
746 &kvm,
747 &vm,
748 &plugin,
749 vcpu_count,
750 &kill_signaled,
751 &exit_evt,
752 &mut vcpu_handles,
753 );
754 if let Err(e) = res {
755 dying_instant.get_or_insert(Instant::now());
756 error!("failed to start vcpus: {}", e);
757 }
758 }
759
760 redo_poll_ctx_sockets =
761 !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;
762
763 // Cleanup all of the sockets that we have determined were disconnected or suffered some
764 // other error.
765 plugin.drop_sockets(&mut sockets_to_drop);
766 sockets_to_drop.clear();
767
768 if redo_poll_ctx_sockets {
769 for socket in plugin.sockets() {
770 let _ = poll_ctx.delete(socket);
771 }
772 }
773 }
774
775 // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
776 kill_signaled.store(true, Ordering::SeqCst);
777 // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests
778 // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
779 // blocked connections.
780 plugin.signal_kill().map_err(Error::PluginKill)?;
781 for handle in vcpu_handles {
782 match handle.kill(SIGRTMIN() + 0) {
783 Ok(_) => {
784 if let Err(e) = handle.join() {
785 error!("failed to join vcpu thread: {:?}", e);
786 }
787 }
788 Err(e) => error!("failed to kill vcpu thread: {}", e),
789 }
790 }
791
792 match plugin.try_wait() {
793 // The plugin has run out of time by now
794 Ok(ProcessStatus::Running) => Err(Error::PluginTimeout),
795 // Return an error discovered earlier in this function.
796 Ok(ProcessStatus::Success) => res,
797 Ok(ProcessStatus::Fail(code)) => Err(Error::PluginFailed(code)),
798 Ok(ProcessStatus::Signal(code)) => Err(Error::PluginKilled(code)),
799 Err(e) => Err(Error::PluginWait(e)),
800 }
801 }
802