// Copyright 2018 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

mod process;
mod vcpu;

use std::fs::File;
use std::io;
use std::io::Read;
use std::io::Write;
use std::os::unix::net::UnixDatagram;
use std::path::Path;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread;
use std::time::Duration;
use std::time::Instant;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use base::add_fd_flags;
use base::block_signal;
use base::clear_signal;
use base::drop_capabilities;
use base::enable_core_scheduling;
use base::error;
use base::getegid;
use base::geteuid;
use base::info;
use base::pipe;
use base::register_rt_signal_handler;
use base::validate_raw_descriptor;
use base::warn;
use base::AsRawDescriptor;
use base::Error as SysError;
use base::Event;
use base::EventToken;
use base::FromRawDescriptor;
use base::Killable;
use base::MmapError;
use base::RawDescriptor;
use base::Result as SysResult;
use base::SignalFd;
use base::WaitContext;
use base::SIGRTMIN;
use devices::virtio::NetParametersMode;
use jail::create_sandbox_minijail;
use jail::mount_proc;
use jail::SandboxConfig;
use kvm::Cap;
use kvm::Datamatch;
use kvm::IoeventAddress;
use kvm::Kvm;
use kvm::Vcpu;
use kvm::VcpuExit;
use kvm::Vm;
use libc::c_int;
use libc::c_ulong;
use libc::fcntl;
use libc::ioctl;
use libc::socketpair;
use libc::AF_UNIX;
use libc::EAGAIN;
use libc::EBADF;
use libc::EDEADLK;
use libc::EEXIST;
use libc::EINTR;
use libc::EINVAL;
use libc::ENOENT;
use libc::EOVERFLOW;
use libc::EPERM;
use libc::FIOCLEX;
use libc::F_SETPIPE_SZ;
use libc::O_NONBLOCK;
use libc::SIGCHLD;
use libc::SOCK_SEQPACKET;
use net_util::sys::unix::Tap;
use net_util::TapTCommon;
use protobuf::ProtobufError;
use remain::sorted;
use thiserror::Error;
use vm_memory::GuestMemory;
use vm_memory::MemoryPolicy;

use self::process::*;
use self::vcpu::*;
use crate::crosvm::config::Executable;
use crate::crosvm::config::HypervisorKind;
use crate::Config;

const MAX_DATAGRAM_SIZE: usize = 4096;
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
const MAX_OPEN_FILES: u64 = 32768;

/// An error that occurs when communicating with the plugin process.
#[sorted]
#[derive(Error, Debug)]
pub enum CommError {
    #[error("failed to decode plugin request: {0}")]
    DecodeRequest(ProtobufError),
    #[error("failed to encode plugin response: {0}")]
    EncodeResponse(ProtobufError),
    #[error("plugin request socket has been hung up")]
    PluginSocketHup,
    #[error("failed to recv from plugin request socket: {0}")]
    PluginSocketRecv(SysError),
    #[error("failed to send to plugin request socket: {0}")]
    PluginSocketSend(SysError),
}

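/// Creates a pair of connected `SOCK_SEQPACKET` sockets wrapped as
/// `UnixDatagram`s, so message boundaries are preserved in both directions.
/// The first socket of the pair is marked close-on-exec so that it is not
/// leaked into the plugin child process.
///
/// A minimal usage sketch (variable names are illustrative only):
///
/// ```ignore
/// let (crosvm_socket, plugin_socket) = new_seqpacket_pair()?;
/// // Hand `plugin_socket` to the plugin process and service requests that
/// // arrive on `crosvm_socket` from the crosvm side.
/// ```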
fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
    let mut fds = [0, 0];
    unsafe {
        let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
        if ret == 0 {
            ioctl(fds[0], FIOCLEX);
            Ok((
                UnixDatagram::from_raw_descriptor(fds[0]),
                UnixDatagram::from_raw_descriptor(fds[1]),
            ))
        } else {
            Err(SysError::last())
        }
    }
}

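/// The two unidirectional pipes that connect crosvm with one of the plugin's
/// VCPU threads: crosvm reads what the plugin writes, and the plugin reads
/// what crosvm writes, each direction on its own pipe.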
struct VcpuPipe {
    crosvm_read: File,
    plugin_write: File,
    plugin_read: File,
    crosvm_write: File,
}

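/// Creates the two pipes backing a `VcpuPipe`, attempting to grow each pipe's
/// capacity to `MAX_VCPU_DATAGRAM_SIZE` so that VCPU messages can be written
/// atomically and without blocking.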
fn new_pipe_pair() -> SysResult<VcpuPipe> {
    let to_crosvm = pipe(true)?;
    let to_plugin = pipe(true)?;
    // Increasing the pipe size is a nice-to-have that helps messages get
    // across atomically (and keeps writes from blocking), though it is not
    // necessarily a hard requirement for things to work.
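    // On Linux, a successful F_SETPIPE_SZ returns the pipe's new capacity,
    // which is why the result is compared against the requested size below;
    // since MAX_VCPU_DATAGRAM_SIZE is a power of two, the kernel should not
    // need to round the request up.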
    let flags = unsafe {
        fcntl(
            to_crosvm.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of crosvm pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
    let flags = unsafe {
        fcntl(
            to_plugin.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of plugin pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
    Ok(VcpuPipe {
        crosvm_read: to_crosvm.0,
        plugin_write: to_crosvm.1,
        plugin_read: to_plugin.0,
        crosvm_write: to_plugin.1,
    })
}

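/// Converts a `ProtobufError` to a `SysError`, preserving the underlying OS
/// errno when the protobuf failure wrapped an I/O error and falling back to
/// `EINVAL` otherwise.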
fn proto_to_sys_err(e: ProtobufError) -> SysError {
    match e {
        ProtobufError::IoError(e) => SysError::new(e.raw_os_error().unwrap_or(EINVAL)),
        _ => SysError::new(EINVAL),
    }
}

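/// Converts an `io::Error` to a `SysError`, falling back to `EINVAL` when no
/// raw OS errno is available.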
fn io_to_sys_err(e: io::Error) -> SysError {
    SysError::new(e.raw_os_error().unwrap_or(EINVAL))
}

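/// Converts an `MmapError` to a `SysError`, preserving the errno of a failed
/// system call and falling back to `EINVAL` otherwise.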
fn mmap_to_sys_err(e: MmapError) -> SysError {
    match e {
        MmapError::SystemCallFailed(e) => e,
        _ => SysError::new(EINVAL),
    }
}

/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be done by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///     Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///     _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    IoEvent {
        evt: Event,
        addr: IoeventAddress,
        length: u32,
        datamatch: u64,
    },
    Memory {
        slot: u32,
        length: usize,
    },
    IrqEvent {
        irq_id: u32,
        evt: Event,
    },
}

impl PluginObject {
    fn destroy(self, vm: &mut Vm) -> SysResult<()> {
        match self {
            PluginObject::IoEvent {
                evt,
                addr,
                length,
                datamatch,
            } => match length {
                0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
                1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
                2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
                4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
                8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch as u64))),
                _ => Err(SysError::new(EINVAL)),
            },
            PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())),
            PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
        }
    }
}

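/// Spawns one thread per VCPU, each of which runs the KVM dispatch loop below
/// and forwards I/O, MMIO, and Hyper-V exits to its per-VCPU plugin
/// connection. The spawned handles are appended to `vcpu_handles`, and all
/// threads rendezvous on a barrier after plugin init before entering the run
/// loop.
///
/// ```ignore
/// // Sketch of a call site; this mirrors the invocation in `run_config`
/// // below.
/// run_vcpus(
///     &kvm,
///     &vm,
///     &plugin,
///     vcpu_count,
///     &kill_signaled,
///     &exit_evt,
///     &mut vcpu_handles,
///     None,
/// )?;
/// ```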
pub fn run_vcpus(
    kvm: &Kvm,
    vm: &Vm,
    plugin: &Process,
    vcpu_count: u32,
    kill_signaled: &Arc<AtomicBool>,
    exit_evt: &Event,
    vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
    vcpu_cgroup_tasks_file: Option<File>,
) -> Result<()> {
    let vcpu_thread_barrier = Arc::new(Barrier::new(vcpu_count as usize));
    let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);

    // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
    // to that vcpu's thread. If KVM is running the VM then it'll return -EINTR.
    // An issue is what to do when KVM isn't running the VM (where we could be
    // in the kernel or in the app).
    //
    // If KVM supports "immediate exit" then we set a signal handler that will
    // set the |immediate_exit| flag that tells KVM to return -EINTR before running
    // the VM.
    //
    // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
    // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
    // signal might get asserted). There's overhead to have KVM unblock and re-block
    // SIGRTMIN each time it runs the VM, so this mode should be avoided.
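    //
    // In short: with Cap::ImmediateExit, the handler sets the thread-local
    // immediate_exit flag and no mask manipulation is needed; without it,
    // SIGRTMIN stays blocked here and KVM runs the guest with an empty
    // blocked-signal mask (set via set_signal_mask below).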

    if use_kvm_signals {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {}
            // Our signal handler does nothing and is trivially async signal safe.
            // We need to install this signal handler even though we do block
            // the signal below, to ensure that this signal will interrupt
            // execution of KVM_RUN (this is an implementation issue).
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
        // We do not really want the signal handler to run...
        block_signal(SIGRTMIN() + 0).expect("failed to block signal");
    } else {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {
                Vcpu::set_local_immediate_exit(true);
            }
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
    }

    for cpu_id in 0..vcpu_count {
        let kill_signaled = kill_signaled.clone();
        let vcpu_thread_barrier = vcpu_thread_barrier.clone();
        let vcpu_exit_evt = exit_evt.try_clone().context("failed to clone event")?;
        let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
        let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).context("error creating vcpu")?;
        let vcpu_cgroup_tasks_file = vcpu_cgroup_tasks_file
            .as_ref()
            .map(|f| f.try_clone().unwrap());

        vcpu_handles.push(
            thread::Builder::new()
                .name(format!("crosvm_vcpu{}", cpu_id))
                .spawn(move || {
                    if use_kvm_signals {
                        // Tell KVM to not block anything when entering kvm run
                        // because we will be using first RT signal to kick the VCPU.
                        vcpu.set_signal_mask(&[])
                            .expect("failed to set up KVM VCPU signal mask");
                    }
                    // Move vcpu thread to cgroup
                    if let Some(mut f) = vcpu_cgroup_tasks_file {
                        f.write_all(base::gettid().to_string().as_bytes()).unwrap();
                    }

                    if let Err(e) = enable_core_scheduling() {
                        error!("Failed to enable core scheduling: {}", e);
                    }

                    let vcpu = vcpu
                        .to_runnable(Some(SIGRTMIN() + 0))
                        .expect("Failed to set thread id");

                    let res = vcpu_plugin.init(&vcpu);
                    vcpu_thread_barrier.wait();
                    if let Err(e) = res {
                        error!("failed to initialize vcpu {}: {}", cpu_id, e);
                    } else {
                        loop {
                            let mut interrupted_by_signal = false;
                            let run_res = vcpu.run();
                            match run_res {
                                Ok(run) => match run {
                                    VcpuExit::IoIn { port, mut size } => {
                                        let mut data = [0; 256];
                                        if size > data.len() {
                                            error!(
                                                "unsupported IoIn size of {} bytes at port {:#x}",
                                                size, port
                                            );
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
                                        if let Err(e) = vcpu.set_data(&data[..size]) {
                                            error!(
                                                "failed to set return data for IoIn at port {:#x}: {}",
                                                port, e
                                            );
                                        }
                                    }
                                    VcpuExit::IoOut {
                                        port,
                                        mut size,
                                        data,
                                    } => {
                                        if size > data.len() {
                                            error!(
                                                "unsupported IoOut size of {} bytes at port {:#x}",
                                                size, port
                                            );
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
                                    }
                                    VcpuExit::MmioRead { address, size } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.mmio_read(
                                            address as u64,
                                            &mut data[..size],
                                            &vcpu,
                                        );
                                        // Setting data for mmio can not fail.
                                        let _ = vcpu.set_data(&data[..size]);
                                    }
                                    VcpuExit::MmioWrite {
                                        address,
                                        size,
                                        data,
                                    } => {
                                        vcpu_plugin.mmio_write(
                                            address as u64,
                                            &data[..size],
                                            &vcpu,
                                        );
                                    }
                                    VcpuExit::HypervHcall { input, params } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
                                        // Setting data for hyperv call can not fail.
                                        let _ = vcpu.set_data(&data);
                                    }
                                    VcpuExit::HypervSynic {
                                        msr,
                                        control,
                                        evt_page,
                                        msg_page,
                                    } => {
                                        vcpu_plugin
                                            .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
                                    }
                                    VcpuExit::Hlt => break,
                                    VcpuExit::Shutdown => break,
                                    VcpuExit::InternalError => {
                                        error!("vcpu {} has internal error", cpu_id);
                                        break;
                                    }
                                    r => warn!("unexpected vcpu exit: {:?}", r),
                                },
                                Err(e) => match e.errno() {
                                    EINTR => interrupted_by_signal = true,
                                    EAGAIN => {}
                                    _ => {
                                        error!("vcpu hit unknown error: {}", e);
                                        break;
                                    }
                                },
                            }
                            if kill_signaled.load(Ordering::SeqCst) {
                                break;
                            }

                            // Only handle the pause request if kvm reported that it was
                            // interrupted by a signal. This helps to ensure that KVM has had a
                            // chance to finish emulating any IO that may have immediately
                            // happened. If we eagerly check pre_run() then any IO that we just
                            // reported to the plugin won't have been processed yet by KVM.
                            // Not eagerly calling pre_run() also helps to reduce any overhead
                            // from checking if a pause request is pending. The assumption is
                            // that pause requests aren't common or frequent so it's better to
                            // optimize for the non-pause execution paths.
                            if interrupted_by_signal {
                                if use_kvm_signals {
                                    clear_signal(SIGRTMIN() + 0)
                                        .expect("failed to clear pending signal");
                                } else {
                                    vcpu.set_immediate_exit(false);
                                }

                                if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
                                    error!("failed to process pause on vcpu {}: {}", cpu_id, e);
                                    break;
                                }
                            }
                        }
                    }
                    vcpu_exit_evt
                        .signal()
                        .expect("failed to signal vcpu exit event");
                })
                .context("error spawning vcpu thread")?,
        );
    }
    Ok(())
}

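/// Tokens identifying which descriptor in the main `WaitContext` became
/// ready: the exit event, a SIGCHLD from the plugin process, the captured
/// stderr pipe, or one of the plugin's request sockets.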
#[derive(EventToken)]
enum Token {
    Exit,
    ChildSignal,
    Stderr,
    Plugin { index: usize },
}

/// Run a VM with a plugin process specified by `cfg`.
///
/// Not every field of `cfg` will be used. In particular, most fields that pertain to a specific
/// device are ignored because the plugin is responsible for emulating hardware.
pub fn run_config(cfg: Config) -> Result<()> {
    info!("crosvm starting plugin process");

    // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
    // before any jailed devices have been spawned, so that we can catch any of them that fail very
    // quickly.
    let sigchld_fd = SignalFd::new(SIGCHLD).context("failed to create signalfd")?;

    // Create a pipe to capture error messages from plugin and minijail.
    let (mut stderr_rd, stderr_wr) = pipe(true).context("failed to create stderr pipe")?;
    add_fd_flags(stderr_rd.as_raw_descriptor(), O_NONBLOCK)
        .context("error marking stderr nonblocking")?;

    let jail = if let Some(jail_config) = &cfg.jail_config {
        if jail_config.seccomp_policy_dir.is_none() {
            bail!("plugin requires seccomp policy file specified.");
        }

        let mut config = SandboxConfig::new(jail_config, "plugin");
        config.bind_mounts = true;
        let uid_map = format!("0 {} 1", geteuid());
        let gid_map = format!("0 {} 1", getegid());
        let gid_map = if !cfg.plugin_gid_maps.is_empty() {
            gid_map
                + &cfg
                    .plugin_gid_maps
                    .into_iter()
                    .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count))
                    .collect::<String>()
        } else {
            gid_map
        };
        config.ugid_map = Some((&uid_map, &gid_map));

        let root_path = cfg.plugin_root.as_ref().unwrap_or(&jail_config.pivot_root);
        let mut jail = create_sandbox_minijail(root_path, MAX_OPEN_FILES, &config)
            .context("create plugin sandbox")?;

        // Because we requested to "run as init", minijail will not mount /proc for us even though
        // plugin will be running in its own PID namespace, so we have to mount it ourselves.
        mount_proc(&mut jail).context("mount proc")?;

        // Mount minimal set of devices (full, zero, urandom, etc). We can not use
        // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
        let device_names = ["full", "null", "urandom", "zero"];
        for name in &device_names {
            let device = Path::new("/dev").join(name);
            jail.mount_bind(&device, &device, true)
                .context("failed to mount dev")?;
        }

        for bind_mount in &cfg.plugin_mounts {
            jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
                .with_context(|| {
                    format!(
                        "failed to bind mount {} -> {} as {} ",
                        bind_mount.src.display(),
                        bind_mount.dst.display(),
                        if bind_mount.writable {
                            "writable"
                        } else {
                            "read only"
                        }
                    )
                })?;
        }

        Some(jail)
    } else {
        None
    };

    let mut tap_interfaces: Vec<Tap> = Vec::new();
    for net_params in cfg.net {
        if net_params.vhost_net {
            bail!("vhost-net not supported with plugin");
        }

        match net_params.mode {
            NetParametersMode::RawConfig {
                host_ip,
                netmask,
                mac,
            } => {
                let tap = Tap::new(false, false).context("error opening tap device")?;
                tap.set_ip_addr(host_ip).context("error setting tap ip")?;
                tap.set_netmask(netmask)
                    .context("error setting tap netmask")?;
                tap.set_mac_address(mac)
                    .context("error setting tap mac address")?;

                tap.enable().context("error enabling tap device")?;
                tap_interfaces.push(tap);
            }
            NetParametersMode::TapName { tap_name, mac } => {
                let tap = Tap::new_with_name(tap_name.as_bytes(), true, false)
                    .context("failed to create tap device from name")?;
                if let Some(mac) = mac {
                    tap.set_mac_address(mac)
                        .context("error setting tap mac address")?;
                }
                tap_interfaces.push(tap);
            }
            NetParametersMode::TapFd { tap_fd, mac } => {
                // Safe because we ensure that we get a unique handle to the fd.
                let tap = unsafe {
                    Tap::from_raw_descriptor(
                        validate_raw_descriptor(tap_fd).context("failed to validate raw tap fd")?,
                    )
                    .context("failed to create tap device from raw fd")?
                };
                if let Some(mac) = mac {
                    tap.set_mac_address(mac)
                        .context("error setting tap mac address")?;
                }
                tap_interfaces.push(tap);
            }
        }
    }

    let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();

    let plugin_path = match cfg.executable_path {
        Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
        _ => panic!("Executable was not a plugin"),
    };
    let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32;
    let mem = GuestMemory::new(&[]).unwrap();
    let mut mem_policy = MemoryPolicy::empty();
    if cfg.hugepages {
        mem_policy |= MemoryPolicy::USE_HUGEPAGES;
    }
    mem.set_memory_policy(mem_policy);

    let kvm_device_path = if let Some(HypervisorKind::Kvm { device }) = &cfg.hypervisor {
        device.as_deref()
    } else {
        None
    };

    let kvm_device_path = kvm_device_path.unwrap_or(Path::new("/dev/kvm"));
    let kvm = Kvm::new_with_path(kvm_device_path).context("error creating Kvm")?;
    let mut vm = Vm::new(&kvm, mem).context("error creating vm")?;
    vm.create_irq_chip()
        .context("failed to create kvm irqchip")?;
    vm.create_pit().context("failed to create kvm PIT")?;

    let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail, stderr_wr)?;
    // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
    // we can drop all our capabilities in case we had any.
    drop_capabilities().context("failed to drop process capabilities")?;

    let mut res = Ok(());
    // If Some, we will exit after enough time has passed to shut down cleanly.
    let mut dying_instant: Option<Instant> = None;
    let duration_to_die = Duration::from_millis(1000);

    let exit_evt = Event::new().context("failed to create event")?;
    let kill_signaled = Arc::new(AtomicBool::new(false));
    let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);

    let wait_ctx = WaitContext::build_with(&[
        (&exit_evt, Token::Exit),
        (&sigchld_fd, Token::ChildSignal),
        (&stderr_rd, Token::Stderr),
    ])
    .context("failed to add control descriptors to wait context")?;

    let mut sockets_to_drop = Vec::new();
    let mut redo_wait_ctx_sockets = true;
    // In this loop, make every attempt to not return early. If an error is encountered, set `res`
    // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
    // If the plugin cannot be signaled because it is dead or `signal_kill` failed, simply break
    // from the poll loop so that the VCPU threads can be cleaned up.
    'wait: loop {
        // After we have waited long enough, it's time to give up and exit.
        if dying_instant
            .map(|i| i.elapsed() >= duration_to_die)
            .unwrap_or(false)
        {
            break;
        }

        if redo_wait_ctx_sockets {
            for (index, socket) in plugin.sockets().iter().enumerate() {
                wait_ctx
                    .add(socket, Token::Plugin { index })
                    .context("failed to add plugin sockets to wait context")?;
            }
        }

        let plugin_socket_count = plugin.sockets().len();
        let events = {
            let poll_res = match dying_instant {
                Some(inst) => wait_ctx.wait_timeout(duration_to_die - inst.elapsed()),
                None => wait_ctx.wait(),
            };
            match poll_res {
                Ok(v) => v,
                Err(e) => {
                    // Polling no longer works; time to break and clean up.
                    if res.is_ok() {
                        res = Err(e).context("failed to poll all FDs");
                    }
                    break;
                }
            }
        };

        for event in events.iter().filter(|e| e.is_hungup) {
            if let Token::Stderr = event.token {
                let _ = wait_ctx.delete(&stderr_rd);
            }
        }

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::Exit => {
                    // No need to check the exit event if we are already doing cleanup.
                    let _ = wait_ctx.delete(&exit_evt);
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on exit event");
                    }
                }
                Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop.
                    loop {
                        match sigchld_fd.read() {
                            Ok(Some(siginfo)) => {
                                // If the plugin process has ended, there is no need to continue
                                // processing plugin connections, so we break early.
                                if siginfo.ssi_pid == plugin.pid() as u32 {
                                    break 'wait;
                                }
                                // Because SIGCHLD is not expected from anything other than the
                                // plugin process, report it as an error.
                                if res.is_ok() {
                                    res = Err(anyhow!(
                                        "process {} died with signal {}, status {}, and code {}",
                                        siginfo.ssi_pid,
                                        siginfo.ssi_signo,
                                        siginfo.ssi_status,
                                        siginfo.ssi_code,
                                    ));
                                }
                            }
                            Ok(None) => break, // No more signals to read.
                            Err(e) => {
                                // Something really must be messed up for this to happen, continue
                                // processing connections for a limited time.
                                if res.is_ok() {
                                    res = Err(e).context("failed to read signal fd");
                                }
                                break;
                            }
                        }
                    }
                    // As we only spawn the plugin process, getting a SIGCHLD can only mean
                    // something went wrong.
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on SIGCHLD");
                    }
                }
                Token::Stderr => loop {
                    let mut buf = [0u8; 4096];
                    match stderr_rd.read(&mut buf) {
                        Ok(len) => {
                            for l in String::from_utf8_lossy(&buf[0..len]).lines() {
                                error!("minijail/plugin: {}", l);
                            }
                        }
                        Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {
                            break;
                        }
                        Err(e) => {
                            error!("failed reading from stderr: {}", e);
                            break;
                        }
                    }
                },
                Token::Plugin { index } => {
                    match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
                    {
                        Ok(_) => {}
                        // A HUP is an expected event for a socket, so don't bother warning about
                        // it.
                        Err(CommError::PluginSocketHup) => sockets_to_drop.push(index),
                        // Only one connection out of potentially many is broken. Drop it, but don't
                        // start cleaning up. Because the error isn't returned, we will warn about
                        // it here.
                        Err(e) => {
                            warn!("error handling plugin socket: {}", e);
                            sockets_to_drop.push(index);
                        }
                    }
                }
            }
        }

        if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
            let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
                None => None,
                Some(cgroup_path) => {
                    // Move main process to cgroup_path
                    let mut f = File::create(cgroup_path.join("tasks"))?;
                    f.write_all(std::process::id().to_string().as_bytes())?;
                    Some(f)
                }
            };

            let res = run_vcpus(
                &kvm,
                &vm,
                &plugin,
                vcpu_count,
                &kill_signaled,
                &exit_evt,
                &mut vcpu_handles,
                vcpu_cgroup_tasks_file,
            );
            if let Err(e) = res {
                dying_instant.get_or_insert(Instant::now());
                error!("failed to start vcpus: {}", e);
            }
        }

        redo_wait_ctx_sockets =
            !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;

        // Cleanup all of the sockets that we have determined were disconnected or suffered some
        // other error.
        plugin.drop_sockets(&mut sockets_to_drop);
        sockets_to_drop.clear();

        if redo_wait_ctx_sockets {
            for socket in plugin.sockets() {
                let _ = wait_ctx.delete(socket);
            }
        }
    }

    // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
    kill_signaled.store(true, Ordering::SeqCst);
    // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests
    // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
    // blocked connections.
    plugin
        .signal_kill()
        .context("error sending kill signal to plugin on cleanup")?;
    for handle in vcpu_handles {
        match handle.kill(SIGRTMIN() + 0) {
            Ok(_) => {
                if let Err(e) = handle.join() {
                    error!("failed to join vcpu thread: {:?}", e);
                }
            }
            Err(e) => error!("failed to kill vcpu thread: {}", e),
        }
    }

    match plugin.try_wait() {
        // The plugin has run out of time by now.
        Ok(ProcessStatus::Running) => Err(anyhow!("plugin did not exit within timeout")),
        // Return an error discovered earlier in this function.
        Ok(ProcessStatus::Success) => res.map_err(anyhow::Error::msg),
        Ok(ProcessStatus::Fail(code)) => Err(anyhow!("plugin exited with error: {}", code)),
        Ok(ProcessStatus::Signal(code)) => Err(anyhow!("plugin exited with signal {}", code)),
        Err(e) => Err(anyhow!("error waiting for plugin to exit: {}", e)),
    }
}