jail/src/helpers.rs

// Copyright 2017 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#![deny(missing_docs)]
#![allow(dead_code)]

use std::path::Path;
use std::str;

use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
#[cfg(feature = "seccomp_trace")]
use base::debug;
use base::getegid;
use base::geteuid;
#[cfg(feature = "seccomp_trace")]
use base::warn;
use libc::c_ulong;
use minijail::Minijail;
use once_cell::sync::Lazy;
#[cfg(feature = "seccomp_trace")]
use static_assertions::const_assert;
#[cfg(feature = "seccomp_trace")]
use zerocopy::Immutable;
#[cfg(feature = "seccomp_trace")]
use zerocopy::IntoBytes;

use crate::config::JailConfig;

// ANDROID: b/246968493
#[cfg(not(feature = "seccomp_trace"))]
static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
    Lazy::new(|| std::collections::HashMap::<&str, Vec<u8>>::new());

/// Most devices don't need to open many fds.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 1024;
/// The max open files for gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32768;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 65536;

/// The user in the jail to run as.
pub enum RunAsUser {
    /// Do not specify the user
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid.
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}

/// Config for the sandbox to be created by [Minijail].
pub struct SandboxConfig<'a> {
    /// Whether or not to drop all capabilities in the sandbox.
    pub limit_caps: bool,
    log_failures: bool,
    seccomp_policy_dir: Option<&'a Path>,
    seccomp_policy_name: &'a str,
    /// The pair of `uid_map` and `gid_map`.
    pub ugid_map: Option<(&'a str, &'a str)>,
    /// The remount mode instead of default MS_PRIVATE.
    pub remount_mode: Option<c_ulong>,
    /// Whether to use empty net namespace. Enabled by default.
    pub namespace_net: bool,
    /// Whether or not to configure the jail to support bind-mounts.
    ///
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
    /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
    /// sandbox.
    pub bind_mounts: bool,
    /// Specify the user in the jail to run as.
    pub run_as: RunAsUser,
}

impl<'a> SandboxConfig<'a> {
    /// Creates [SandboxConfig].
    pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
        Self {
            limit_caps: true,
            log_failures: jail_config.seccomp_log_failures,
            seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
            seccomp_policy_name: policy,
            ugid_map: None,
            remount_mode: None,
            namespace_net: true,
            bind_mounts: false,
            run_as: RunAsUser::Unspecified,
        }
    }
}

/// Wrapper that cleans up a [Minijail] when it is dropped
pub struct ScopedMinijail(pub Minijail);

impl Drop for ScopedMinijail {
    fn drop(&mut self) {
        let _ = self.0.kill();
    }
}

/// Creates a default Minijail instance with no configuration.
pub fn create_default_minijail() -> minijail::Result<Minijail> {
    Minijail::new().map(|mut jail| {
        // Temporarily disable multithreaded check due to a regression in linux 6.12.5
        // TODO(b/395899741): Remove after kernel upstream is fixed.
        jail.disable_multithreaded_check();
        jail
    })
}

/// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
/// `max_open_files` using `RLIMIT_NOFILE`.
///
/// If `root` path is "/", the minijail don't change the root.
///
/// # Arguments
///
/// * `root` - The root path to be changed to by minijail.
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
#[allow(clippy::unnecessary_cast)]
pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
    // Validate new root directory. Path::is_dir() also checks the existence.
    if !root.is_dir() {
        bail!("{:?} is not a directory, cannot create jail", root);
    }
    // chroot accepts absolute path only.
    if !root.is_absolute() {
        bail!("{:?} is not absolute path", root);
    }

    let mut jail = create_default_minijail().context("failed to jail device")?;

    // Only pivot_root if we are not re-using the current root directory.
    if root != Path::new("/") {
        // Run in a new mount namespace.
        jail.namespace_vfs();
        jail.enter_pivot_root(root)
            .context("failed to pivot root device")?;
    }

    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
        .context("error setting max open files")?;

    Ok(jail)
}

/// Creates a [Minijail] instance which just invokes a jail process and sets
/// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
/// runs as a non-root user without SYS_ADMIN capabilities.
///
/// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
/// and `mount namespace`. So, it runs as a non-root user without
/// SYS_ADMIN capabilities.
///
/// Note that since there is no file system isolation provided by this function,
/// caller of this function should enforce other security mechanisum such as selinux
/// on the host to protect directories.
///
/// # Arguments
///
/// * `root` - The root path to checked before the process is jailed
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
#[allow(clippy::unnecessary_cast)]
pub fn create_base_minijail_without_pivot_root(
    root: &Path,
    max_open_files: u64,
) -> Result<Minijail> {
    // Validate new root directory. Path::is_dir() also checks the existence.
    if !root.is_dir() {
        bail!("{:?} is not a directory, cannot create jail", root);
    }
    if !root.is_absolute() {
        bail!("{:?} is not absolute path", root);
    }

    let mut jail = create_default_minijail().context("failed to jail device")?;
    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
        .context("error setting max open files")?;

    Ok(jail)
}

/// Creates a [Minijail] instance which creates a sandbox.
///
/// # Arguments
///
/// * `root` - The root path to be changed to by minijail.
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
/// * `config` - The [SandboxConfig] to control details of the sandbox.
pub fn create_sandbox_minijail(
    root: &Path,
    max_open_files: u64,
    config: &SandboxConfig,
) -> Result<Minijail> {
    let mut jail = create_base_minijail(root, max_open_files)?;

    jail.namespace_pids();
    jail.namespace_user();
    jail.namespace_user_disable_setgroups();
    if config.limit_caps {
        // Don't need any capabilities.
        jail.use_caps(0);
    }
    match config.run_as {
        RunAsUser::Unspecified => {
            if config.bind_mounts && config.ugid_map.is_none() {
                // Minijail requires to set user/group map to mount extra directories.
                add_current_user_to_jail(&mut jail)?;
            }
        }
        RunAsUser::CurrentUser => {
            add_current_user_to_jail(&mut jail)?;
        }
        RunAsUser::Root => {
            // Add the current user as root in the jail.
            let crosvm_uid = geteuid();
            let crosvm_gid = getegid();
            jail.uidmap(&format!("0 {} 1", crosvm_uid))
                .context("error setting UID map")?;
            jail.gidmap(&format!("0 {} 1", crosvm_gid))
                .context("error setting GID map")?;
        }
        RunAsUser::Specified(uid, gid) => {
            if uid != 0 {
                jail.change_uid(uid)
            }
            if gid != 0 {
                jail.change_gid(gid)
            }
        }
    }
    if config.bind_mounts {
        // Create a tmpfs in the device's root directory so that we can bind mount files.
        // The size=67108864 is size=64*1024*1024 or size=64MB.
        // TODO(b/267581374): Use appropriate size for tmpfs.
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
            "size=67108864",
        )?;
    }
    if let Some((uid_map, gid_map)) = config.ugid_map {
        jail.uidmap(uid_map).context("error setting UID map")?;
        jail.gidmap(gid_map).context("error setting GID map")?;
    }
    // Run in a new mount namespace.
    jail.namespace_vfs();

    if config.namespace_net {
        // Run in an empty network namespace.
        jail.namespace_net();
    }

    // Don't allow the device to gain new privileges.
    jail.no_new_privs();

    #[cfg(feature = "seccomp_trace")]
    {
        #[repr(C)]
        #[derive(Immutable, IntoBytes)]
        struct sock_filter {
            /* Filter block */
            code: u16, /* Actual filter code */
            jt: u8,    /* Jump true */
            jf: u8,    /* Jump false */
            k: u32,    /* Generic multiuse field */
        }

        // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
        // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
        const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
        const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
        const BPF_RET: u16 = 0x06;
        const BPF_K: u16 = 0x00;

        // return SECCOMP_RET_LOG for all syscalls
        const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
            code: BPF_RET | BPF_K,
            jt: 0,
            jf: 0,
            k: SECCOMP_RET_LOG,
        };

        warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
        debug!(
            "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
            config.seccomp_policy_name,
            read_jail_addr(&jail),
        );
        jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
            .unwrap();
    }

    #[cfg(not(feature = "seccomp_trace"))]
    if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
        let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
        // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
        // is expected to be compiled using "trap" as the failure behavior instead of the default
        // "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
        // the built-in pre-compiled policies will be used.
        // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
        // explanation about why the |log_failures| flag forces the use of .policy files (and the
        // build-time alternative to this run-time flag).
        let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
        if bpf_policy_file.exists() && !config.log_failures {
            jail.parse_seccomp_program(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse precompiled seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        } else {
            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
            // kill the entire device process if a worker thread commits a seccomp violation.
            jail.set_seccomp_filter_tsync();
            if config.log_failures {
                jail.log_seccomp_filter_failures();
            }
            let bpf_policy_file = seccomp_policy_path.with_extension("policy");
            jail.parse_seccomp_filters(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        }
    } else {
        set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
    }

    jail.use_seccomp_filter();
    // Don't do init setup.
    jail.run_as_init();
    // Set up requested remount mode instead of default MS_PRIVATE.
    if let Some(mode) = config.remount_mode {
        jail.set_remount_mode(mode);
    }

    Ok(jail)
}

/// Creates a basic [Minijail] if `jail_config` is present.
///
/// Returns `None` if `jail_config` is none.
pub fn simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>> {
    if let Some(jail_config) = jail_config {
        let config = SandboxConfig::new(jail_config, policy);
        Ok(Some(create_sandbox_minijail(
            &jail_config.pivot_root,
            MAX_OPEN_FILES_DEFAULT,
            &config,
        )?))
    } else {
        Ok(None)
    }
}

/// Creates [Minijail] for gpu processes.
pub fn create_gpu_minijail(
    root: &Path,
    config: &SandboxConfig,
    render_node_only: bool,
    snapshot_scratch_directory: Option<&Path>,
) -> Result<Minijail> {
    let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;

    // Device nodes required for DRM.
    let sys_dev_char_path = Path::new("/sys/dev/char");
    jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;

    // Necessary for CGROUP control of the vGPU threads
    // This is not necessary UNLESS one wants to make use
    // of the gpu cgroup command line options.
    let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
    if sys_cpuset_path.exists() {
        jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
    }

    let sys_devices_path = Path::new("/sys/devices");
    jail.mount_bind(sys_devices_path, sys_devices_path, false)?;

    jail_mount_bind_drm(&mut jail, render_node_only)?;

    // If the ARM specific devices exist on the host, bind mount them in.
    let mali0_path = Path::new("/dev/mali0");
    if mali0_path.exists() {
        jail.mount_bind(mali0_path, mali0_path, true)?;
    }

    let pvr_sync_path = Path::new("/dev/pvr_sync");
    if pvr_sync_path.exists() {
        jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
    }

    // If the udmabuf driver exists on the host, bind mount it in.
    let udmabuf_path = Path::new("/dev/udmabuf");
    if udmabuf_path.exists() {
        jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
    }

    // Libraries that are required when mesa drivers are dynamically loaded.
    jail_mount_bind_if_exists(
        &mut jail,
        &[
            "/usr/lib",
            "/usr/lib64",
            "/lib",
            "/lib64",
            "/usr/share/drirc.d",
            "/usr/share/glvnd",
            "/usr/share/libdrm",
            "/usr/share/vulkan",
        ],
    )?;

    // pvr driver requires read access to /proc/self/task/*/comm.
    mount_proc(&mut jail)?;

    // To enable perfetto tracing, we need to give access to the perfetto service IPC
    // endpoints.
    let perfetto_path = Path::new("/run/perfetto");
    if perfetto_path.exists() {
        jail.mount_bind(perfetto_path, perfetto_path, true)?;
    }

    // Provide scratch space for the GPU device to build or unpack snapshots.
    if let Some(snapshot_scratch_directory) = snapshot_scratch_directory {
        jail.mount_with_data(
            Path::new("none"),
            snapshot_scratch_directory,
            "tmpfs",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
            "size=4294967296",
        )?;
    }

    Ok(jail)
}

/// Selectively bind mount drm nodes into `jail` based on `render_node_only`
///
/// This function will not return an error if drm nodes don't exist
pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
    if render_node_only {
        const DRM_NUM_NODES: u32 = 63;
        const DRM_RENDER_NODE_START: u32 = 128;
        for offset in 0..DRM_NUM_NODES {
            let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
            let drm_dri_path = Path::new(&path_str);
            if !drm_dri_path.exists() {
                break;
            }
            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
        }
    } else {
        let drm_dri_path = Path::new("/dev/dri");
        if drm_dri_path.exists() {
            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
        }
    }

    Ok(())
}

/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
///
/// This function will not return an error if any of the directories in `dirs` is missing.
pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
    jail: &mut Minijail,
    dirs: &[P],
) -> Result<()> {
    for dir in dirs {
        let dir_path = Path::new(dir);
        if dir_path.exists() {
            jail.mount_bind(dir_path, dir_path, false)?;
        }
    }

    Ok(())
}

/// Mount proc in the sandbox.
pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
    jail.mount(
        Path::new("proc"),
        Path::new("/proc"),
        "proc",
        (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
    )?;
    Ok(())
}

/// Read minijail internal struct address for uniquely identifying and tracking jail's lifetime
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // We can only hope minijail's rust object will always contain a pointer to C jail struct as the
    // first field.
    const_assert!(std::mem::size_of::<Minijail>() >= std::mem::size_of::<usize>());
    // Safe because it's only doing a read within bound checked by static assert
    unsafe { *(jail as *const Minijail as *const usize) }
}

/// Set the uid/gid for the jailed process and give a basic id map. This is
/// required for bind mounts to work.
fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
    let crosvm_uid = geteuid();
    let crosvm_gid = getegid();

    jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
        .context("error setting UID map")?;
    jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
        .context("error setting GID map")?;

    if crosvm_uid != 0 {
        jail.change_uid(crosvm_uid);
    }
    if crosvm_gid != 0 {
        jail.change_gid(crosvm_gid);
    }
    Ok(())
}

/// Set the seccomp policy for a jail from embedded bpfs
pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
    let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
        format!(
            "failed to find embedded seccomp policy: {}",
            seccomp_policy_name
        )
    })?;
    jail.parse_seccomp_bytes(bpf_program).with_context(|| {
        format!(
            "failed to parse embedded seccomp policy: {}",
            seccomp_policy_name
        )
    })?;
    Ok(())
}