• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #![deny(missing_docs)]
6 #![allow(dead_code)]
7 
8 use std::path::Path;
9 use std::str;
10 
11 use anyhow::bail;
12 use anyhow::Context;
13 use anyhow::Result;
14 #[cfg(feature = "seccomp_trace")]
15 use base::debug;
16 use base::getegid;
17 use base::geteuid;
18 #[cfg(feature = "seccomp_trace")]
19 use base::warn;
20 use libc::c_ulong;
21 use minijail::Minijail;
22 use once_cell::sync::Lazy;
23 #[cfg(feature = "seccomp_trace")]
24 use static_assertions::const_assert;
25 #[cfg(feature = "seccomp_trace")]
26 use zerocopy::Immutable;
27 #[cfg(feature = "seccomp_trace")]
28 use zerocopy::IntoBytes;
29 
30 use crate::config::JailConfig;
31 
32 // ANDROID: b/246968493
33 #[cfg(not(feature = "seccomp_trace"))]
34 static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
35     Lazy::new(|| std::collections::HashMap::<&str, Vec<u8>>::new());
36 
/// Default limit on open file descriptors; most devices don't need to open many fds.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 1_024;
/// The max open files for gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32_768;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 65_536;
43 
/// The user in the jail to run as.
///
/// This is a small plain-data selector, so it is cheap to copy, comparable, and printable for
/// diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid (in that order).
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
56 
57 /// Config for the sandbox to be created by [Minijail].
58 pub struct SandboxConfig<'a> {
59     /// Whether or not to drop all capabilities in the sandbox.
60     pub limit_caps: bool,
61     log_failures: bool,
62     seccomp_policy_dir: Option<&'a Path>,
63     seccomp_policy_name: &'a str,
64     /// The pair of `uid_map` and `gid_map`.
65     pub ugid_map: Option<(&'a str, &'a str)>,
66     /// The remount mode instead of default MS_PRIVATE.
67     pub remount_mode: Option<c_ulong>,
68     /// Whether to use empty net namespace. Enabled by default.
69     pub namespace_net: bool,
70     /// Whether or not to configure the jail to support bind-mounts.
71     ///
72     /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
73     /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
74     /// sandbox.
75     pub bind_mounts: bool,
76     /// Specify the user in the jail to run as.
77     pub run_as: RunAsUser,
78 }
79 
80 impl<'a> SandboxConfig<'a> {
81     /// Creates [SandboxConfig].
new(jail_config: &'a JailConfig, policy: &'a str) -> Self82     pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
83         Self {
84             limit_caps: true,
85             log_failures: jail_config.seccomp_log_failures,
86             seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
87             seccomp_policy_name: policy,
88             ugid_map: None,
89             remount_mode: None,
90             namespace_net: true,
91             bind_mounts: false,
92             run_as: RunAsUser::Unspecified,
93         }
94     }
95 }
96 
97 /// Wrapper that cleans up a [Minijail] when it is dropped
98 pub struct ScopedMinijail(pub Minijail);
99 
100 impl Drop for ScopedMinijail {
drop(&mut self)101     fn drop(&mut self) {
102         let _ = self.0.kill();
103     }
104 }
105 
106 /// Creates a default Minijail instance with no configuration.
create_default_minijail() -> minijail::Result<Minijail>107 pub fn create_default_minijail() -> minijail::Result<Minijail> {
108     Minijail::new().map(|mut jail| {
109         // Temporarily disable multithreaded check due to a regression in linux 6.12.5
110         // TODO(b/395899741): Remove after kernel upstream is fixed.
111         jail.disable_multithreaded_check();
112         jail
113     })
114 }
115 
116 /// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
117 /// `max_open_files` using `RLIMIT_NOFILE`.
118 ///
119 /// If `root` path is "/", the minijail don't change the root.
120 ///
121 /// # Arguments
122 ///
123 /// * `root` - The root path to be changed to by minijail.
124 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
125 #[allow(clippy::unnecessary_cast)]
create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail>126 pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
127     // Validate new root directory. Path::is_dir() also checks the existence.
128     if !root.is_dir() {
129         bail!("{:?} is not a directory, cannot create jail", root);
130     }
131     // chroot accepts absolute path only.
132     if !root.is_absolute() {
133         bail!("{:?} is not absolute path", root);
134     }
135 
136     let mut jail = create_default_minijail().context("failed to jail device")?;
137 
138     // Only pivot_root if we are not re-using the current root directory.
139     if root != Path::new("/") {
140         // Run in a new mount namespace.
141         jail.namespace_vfs();
142         jail.enter_pivot_root(root)
143             .context("failed to pivot root device")?;
144     }
145 
146     jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
147         .context("error setting max open files")?;
148 
149     Ok(jail)
150 }
151 
152 /// Creates a [Minijail] instance which just invokes a jail process and sets
153 /// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
154 /// runs as a non-root user without SYS_ADMIN capabilities.
155 ///
156 /// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
157 /// and `mount namespace`. So, it runs as a non-root user without
158 /// SYS_ADMIN capabilities.
159 ///
160 /// Note that since there is no file system isolation provided by this function,
161 /// caller of this function should enforce other security mechanisum such as selinux
162 /// on the host to protect directories.
163 ///
164 /// # Arguments
165 ///
166 /// * `root` - The root path to checked before the process is jailed
167 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
168 #[allow(clippy::unnecessary_cast)]
create_base_minijail_without_pivot_root( root: &Path, max_open_files: u64, ) -> Result<Minijail>169 pub fn create_base_minijail_without_pivot_root(
170     root: &Path,
171     max_open_files: u64,
172 ) -> Result<Minijail> {
173     // Validate new root directory. Path::is_dir() also checks the existence.
174     if !root.is_dir() {
175         bail!("{:?} is not a directory, cannot create jail", root);
176     }
177     if !root.is_absolute() {
178         bail!("{:?} is not absolute path", root);
179     }
180 
181     let mut jail = create_default_minijail().context("failed to jail device")?;
182     jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
183         .context("error setting max open files")?;
184 
185     Ok(jail)
186 }
187 
188 /// Creates a [Minijail] instance which creates a sandbox.
189 ///
190 /// # Arguments
191 ///
192 /// * `root` - The root path to be changed to by minijail.
193 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
194 /// * `config` - The [SandboxConfig] to control details of the sandbox.
create_sandbox_minijail( root: &Path, max_open_files: u64, config: &SandboxConfig, ) -> Result<Minijail>195 pub fn create_sandbox_minijail(
196     root: &Path,
197     max_open_files: u64,
198     config: &SandboxConfig,
199 ) -> Result<Minijail> {
200     let mut jail = create_base_minijail(root, max_open_files)?;
201 
202     jail.namespace_pids();
203     jail.namespace_user();
204     jail.namespace_user_disable_setgroups();
205     if config.limit_caps {
206         // Don't need any capabilities.
207         jail.use_caps(0);
208     }
209     match config.run_as {
210         RunAsUser::Unspecified => {
211             if config.bind_mounts && config.ugid_map.is_none() {
212                 // Minijail requires to set user/group map to mount extra directories.
213                 add_current_user_to_jail(&mut jail)?;
214             }
215         }
216         RunAsUser::CurrentUser => {
217             add_current_user_to_jail(&mut jail)?;
218         }
219         RunAsUser::Root => {
220             // Add the current user as root in the jail.
221             let crosvm_uid = geteuid();
222             let crosvm_gid = getegid();
223             jail.uidmap(&format!("0 {} 1", crosvm_uid))
224                 .context("error setting UID map")?;
225             jail.gidmap(&format!("0 {} 1", crosvm_gid))
226                 .context("error setting GID map")?;
227         }
228         RunAsUser::Specified(uid, gid) => {
229             if uid != 0 {
230                 jail.change_uid(uid)
231             }
232             if gid != 0 {
233                 jail.change_gid(gid)
234             }
235         }
236     }
237     if config.bind_mounts {
238         // Create a tmpfs in the device's root directory so that we can bind mount files.
239         // The size=67108864 is size=64*1024*1024 or size=64MB.
240         // TODO(b/267581374): Use appropriate size for tmpfs.
241         jail.mount_with_data(
242             Path::new("none"),
243             Path::new("/"),
244             "tmpfs",
245             (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
246             "size=67108864",
247         )?;
248     }
249     if let Some((uid_map, gid_map)) = config.ugid_map {
250         jail.uidmap(uid_map).context("error setting UID map")?;
251         jail.gidmap(gid_map).context("error setting GID map")?;
252     }
253     // Run in a new mount namespace.
254     jail.namespace_vfs();
255 
256     if config.namespace_net {
257         // Run in an empty network namespace.
258         jail.namespace_net();
259     }
260 
261     // Don't allow the device to gain new privileges.
262     jail.no_new_privs();
263 
264     #[cfg(feature = "seccomp_trace")]
265     {
266         #[repr(C)]
267         #[derive(Immutable, IntoBytes)]
268         struct sock_filter {
269             /* Filter block */
270             code: u16, /* Actual filter code */
271             jt: u8,    /* Jump true */
272             jf: u8,    /* Jump false */
273             k: u32,    /* Generic multiuse field */
274         }
275 
276         // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
277         // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
278         const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
279         const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
280         const BPF_RET: u16 = 0x06;
281         const BPF_K: u16 = 0x00;
282 
283         // return SECCOMP_RET_LOG for all syscalls
284         const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
285             code: BPF_RET | BPF_K,
286             jt: 0,
287             jf: 0,
288             k: SECCOMP_RET_LOG,
289         };
290 
291         warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
292         debug!(
293             "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
294             config.seccomp_policy_name,
295             read_jail_addr(&jail),
296         );
297         jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
298             .unwrap();
299     }
300 
301     #[cfg(not(feature = "seccomp_trace"))]
302     if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
303         let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
304         // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
305         // is expected to be compiled using "trap" as the failure behavior instead of the default
306         // "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
307         // the built-in pre-compiled policies will be used.
308         // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
309         // explanation about why the |log_failures| flag forces the use of .policy files (and the
310         // build-time alternative to this run-time flag).
311         let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
312         if bpf_policy_file.exists() && !config.log_failures {
313             jail.parse_seccomp_program(&bpf_policy_file)
314                 .with_context(|| {
315                     format!(
316                         "failed to parse precompiled seccomp policy: {}",
317                         bpf_policy_file.display()
318                     )
319                 })?;
320         } else {
321             // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
322             // kill the entire device process if a worker thread commits a seccomp violation.
323             jail.set_seccomp_filter_tsync();
324             if config.log_failures {
325                 jail.log_seccomp_filter_failures();
326             }
327             let bpf_policy_file = seccomp_policy_path.with_extension("policy");
328             jail.parse_seccomp_filters(&bpf_policy_file)
329                 .with_context(|| {
330                     format!(
331                         "failed to parse seccomp policy: {}",
332                         bpf_policy_file.display()
333                     )
334                 })?;
335         }
336     } else {
337         set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
338     }
339 
340     jail.use_seccomp_filter();
341     // Don't do init setup.
342     jail.run_as_init();
343     // Set up requested remount mode instead of default MS_PRIVATE.
344     if let Some(mode) = config.remount_mode {
345         jail.set_remount_mode(mode);
346     }
347 
348     Ok(jail)
349 }
350 
351 /// Creates a basic [Minijail] if `jail_config` is present.
352 ///
353 /// Returns `None` if `jail_config` is none.
simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>>354 pub fn simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>> {
355     if let Some(jail_config) = jail_config {
356         let config = SandboxConfig::new(jail_config, policy);
357         Ok(Some(create_sandbox_minijail(
358             &jail_config.pivot_root,
359             MAX_OPEN_FILES_DEFAULT,
360             &config,
361         )?))
362     } else {
363         Ok(None)
364     }
365 }
366 
367 /// Creates [Minijail] for gpu processes.
create_gpu_minijail( root: &Path, config: &SandboxConfig, render_node_only: bool, snapshot_scratch_directory: Option<&Path>, ) -> Result<Minijail>368 pub fn create_gpu_minijail(
369     root: &Path,
370     config: &SandboxConfig,
371     render_node_only: bool,
372     snapshot_scratch_directory: Option<&Path>,
373 ) -> Result<Minijail> {
374     let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
375 
376     // Device nodes required for DRM.
377     let sys_dev_char_path = Path::new("/sys/dev/char");
378     jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
379 
380     // Necessary for CGROUP control of the vGPU threads
381     // This is not necessary UNLESS one wants to make use
382     // of the gpu cgroup command line options.
383     let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
384     if sys_cpuset_path.exists() {
385         jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
386     }
387 
388     let sys_devices_path = Path::new("/sys/devices");
389     jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
390 
391     jail_mount_bind_drm(&mut jail, render_node_only)?;
392 
393     // If the ARM specific devices exist on the host, bind mount them in.
394     let mali0_path = Path::new("/dev/mali0");
395     if mali0_path.exists() {
396         jail.mount_bind(mali0_path, mali0_path, true)?;
397     }
398 
399     let pvr_sync_path = Path::new("/dev/pvr_sync");
400     if pvr_sync_path.exists() {
401         jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
402     }
403 
404     // If the udmabuf driver exists on the host, bind mount it in.
405     let udmabuf_path = Path::new("/dev/udmabuf");
406     if udmabuf_path.exists() {
407         jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
408     }
409 
410     // Libraries that are required when mesa drivers are dynamically loaded.
411     jail_mount_bind_if_exists(
412         &mut jail,
413         &[
414             "/usr/lib",
415             "/usr/lib64",
416             "/lib",
417             "/lib64",
418             "/usr/share/drirc.d",
419             "/usr/share/glvnd",
420             "/usr/share/libdrm",
421             "/usr/share/vulkan",
422         ],
423     )?;
424 
425     // pvr driver requires read access to /proc/self/task/*/comm.
426     mount_proc(&mut jail)?;
427 
428     // To enable perfetto tracing, we need to give access to the perfetto service IPC
429     // endpoints.
430     let perfetto_path = Path::new("/run/perfetto");
431     if perfetto_path.exists() {
432         jail.mount_bind(perfetto_path, perfetto_path, true)?;
433     }
434 
435     // Provide scratch space for the GPU device to build or unpack snapshots.
436     if let Some(snapshot_scratch_directory) = snapshot_scratch_directory {
437         jail.mount_with_data(
438             Path::new("none"),
439             snapshot_scratch_directory,
440             "tmpfs",
441             (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
442             "size=4294967296",
443         )?;
444     }
445 
446     Ok(jail)
447 }
448 
449 /// Selectively bind mount drm nodes into `jail` based on `render_node_only`
450 ///
451 /// This function will not return an error if drm nodes don't exist
jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()>452 pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
453     if render_node_only {
454         const DRM_NUM_NODES: u32 = 63;
455         const DRM_RENDER_NODE_START: u32 = 128;
456         for offset in 0..DRM_NUM_NODES {
457             let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
458             let drm_dri_path = Path::new(&path_str);
459             if !drm_dri_path.exists() {
460                 break;
461             }
462             jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
463         }
464     } else {
465         let drm_dri_path = Path::new("/dev/dri");
466         if drm_dri_path.exists() {
467             jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
468         }
469     }
470 
471     Ok(())
472 }
473 
474 /// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
475 ///
476 /// This function will not return an error if any of the directories in `dirs` is missing.
jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>( jail: &mut Minijail, dirs: &[P], ) -> Result<()>477 pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
478     jail: &mut Minijail,
479     dirs: &[P],
480 ) -> Result<()> {
481     for dir in dirs {
482         let dir_path = Path::new(dir);
483         if dir_path.exists() {
484             jail.mount_bind(dir_path, dir_path, false)?;
485         }
486     }
487 
488     Ok(())
489 }
490 
491 /// Mount proc in the sandbox.
mount_proc(jail: &mut Minijail) -> Result<()>492 pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
493     jail.mount(
494         Path::new("proc"),
495         Path::new("/proc"),
496         "proc",
497         (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
498     )?;
499     Ok(())
500 }
501 
/// Read minijail internal struct address for uniquely identifying and tracking jail's lifetime
///
/// The returned value is the first `usize`-sized word of the `Minijail` object.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // We can only hope minijail's rust object will always contain a pointer to C jail struct as the
    // first field.
    const_assert!(std::mem::size_of::<Minijail>() >= std::mem::size_of::<usize>());
    let first_word = (jail as *const Minijail).cast::<usize>();
    // SAFETY: `jail` is a live reference and the static assertion above guarantees the object is
    // at least `usize`-sized, so the read stays in bounds.
    // NOTE(review): this also assumes `Minijail` is at least `usize`-aligned and that its first
    // word is initialized — confirm against the minijail crate's struct layout.
    unsafe { first_word.read() }
}
511 
512 /// Set the uid/gid for the jailed process and give a basic id map. This is
513 /// required for bind mounts to work.
add_current_user_to_jail(jail: &mut Minijail) -> Result<()>514 fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
515     let crosvm_uid = geteuid();
516     let crosvm_gid = getegid();
517 
518     jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
519         .context("error setting UID map")?;
520     jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
521         .context("error setting GID map")?;
522 
523     if crosvm_uid != 0 {
524         jail.change_uid(crosvm_uid);
525     }
526     if crosvm_gid != 0 {
527         jail.change_gid(crosvm_gid);
528     }
529     Ok(())
530 }
531 
532 /// Set the seccomp policy for a jail from embedded bpfs
set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()>533 pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
534     let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
535         format!(
536             "failed to find embedded seccomp policy: {}",
537             seccomp_policy_name
538         )
539     })?;
540     jail.parse_seccomp_bytes(bpf_program).with_context(|| {
541         format!(
542             "failed to parse embedded seccomp policy: {}",
543             seccomp_policy_name
544         )
545     })?;
546     Ok(())
547 }
548