• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #![deny(missing_docs)]
6 #![allow(dead_code)]
7 
8 use std::path::Path;
9 use std::str;
10 
11 use anyhow::bail;
12 use anyhow::Context;
13 use anyhow::Result;
14 #[cfg(feature = "seccomp_trace")]
15 use base::debug;
16 use base::getegid;
17 use base::geteuid;
18 #[cfg(feature = "seccomp_trace")]
19 use base::warn;
20 use libc::c_ulong;
21 use minijail::Minijail;
22 #[cfg(not(feature = "seccomp_trace"))]
23 use once_cell::sync::Lazy;
24 #[cfg(feature = "seccomp_trace")]
25 use static_assertions::assert_eq_size;
26 #[cfg(feature = "seccomp_trace")]
27 use zerocopy::AsBytes;
28 
29 use crate::config::JailConfig;
30 
31 // ANDROID: b/246968493
32 #[cfg(not(feature = "seccomp_trace"))]
33 static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
34     Lazy::new(|| std::collections::HashMap::<&str, Vec<u8>>::new());
35 
/// Default open-file limit; most devices don't need to open many fds.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 1024;
/// Open-file limit used for gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32 * 1024;
/// Open-file limit used for the jail warden, matching FD_RAW_FAILURE.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 64 * 1024;
42 
/// The user in the jail to run as.
///
/// The type is a small plain value, so it derives the usual value-type traits: it can be
/// logged with `{:?}`, copied, and compared for equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user.
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid, in that order.
    ///
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
55 
/// Config for the sandbox to be created by [Minijail].
///
/// The seccomp-related fields are private and are filled in by [SandboxConfig::new]; the
/// remaining fields are public so callers can adjust them after construction.
pub struct SandboxConfig<'a> {
    /// Whether or not to drop all capabilities in the sandbox.
    pub limit_caps: bool,
    // Whether seccomp filter failures should be logged. When set, the textual `.policy` file is
    // used even if a pre-compiled `.bpf` file exists.
    log_failures: bool,
    // Directory holding the seccomp policy files. When `None`, the policy is looked up in the
    // embedded BPF table instead.
    seccomp_policy_dir: Option<&'a Path>,
    // Base name of the seccomp policy, without the `.bpf`/`.policy` extension.
    seccomp_policy_name: &'a str,
    /// The pair of `uid_map` and `gid_map`.
    pub ugid_map: Option<(&'a str, &'a str)>,
    /// The remount mode instead of default MS_PRIVATE.
    pub remount_mode: Option<c_ulong>,
    /// Whether to use empty net namespace. Enabled by default.
    pub namespace_net: bool,
    /// Whether or not to configure the jail to support bind-mounts.
    ///
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
    /// return `ENOENT`. Passing opened file descriptors is recommended over opening files in the
    /// sandbox.
    pub bind_mounts: bool,
    /// Specify the user in the jail to run as.
    pub run_as: RunAsUser,
}
78 
79 impl<'a> SandboxConfig<'a> {
80     /// Creates [SandboxConfig].
new(jail_config: &'a JailConfig, policy: &'a str) -> Self81     pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
82         Self {
83             limit_caps: true,
84             log_failures: jail_config.seccomp_log_failures,
85             seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
86             seccomp_policy_name: policy,
87             ugid_map: None,
88             remount_mode: None,
89             namespace_net: true,
90             bind_mounts: false,
91             run_as: RunAsUser::Unspecified,
92         }
93     }
94 }
95 
96 /// Wrapper that cleans up a [Minijail] when it is dropped
97 pub struct ScopedMinijail(pub Minijail);
98 
99 impl Drop for ScopedMinijail {
drop(&mut self)100     fn drop(&mut self) {
101         let _ = self.0.kill();
102     }
103 }
104 
105 /// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
106 /// `max_open_files` using `RLIMIT_NOFILE`.
107 ///
108 /// If `root` path is "/", the minijail don't change the root.
109 ///
110 /// # Arguments
111 ///
112 /// * `root` - The root path to be changed to by minijail.
113 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail>114 pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
115     // Validate new root directory. Path::is_dir() also checks the existence.
116     if !root.is_dir() {
117         bail!("{:?} is not a directory, cannot create jail", root);
118     }
119     // chroot accepts absolute path only.
120     if !root.is_absolute() {
121         bail!("{:?} is not absolute path", root);
122     }
123 
124     // All child jails run in a new user namespace without any users mapped, they run as nobody
125     // unless otherwise configured.
126     let mut jail = Minijail::new().context("failed to jail device")?;
127 
128     // Only pivot_root if we are not re-using the current root directory.
129     if root != Path::new("/") {
130         // It's safe to call `namespace_vfs` multiple times.
131         jail.namespace_vfs();
132         jail.enter_pivot_root(root)
133             .context("failed to pivot root device")?;
134     }
135 
136     jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
137         .context("error setting max open files")?;
138 
139     Ok(jail)
140 }
141 
142 /// Creates a [Minijail] instance which creates a sandbox.
143 ///
144 /// # Arguments
145 ///
146 /// * `root` - The root path to be changed to by minijail.
147 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
148 /// * `config` - The [SandboxConfig] to control details of the sandbox.
create_sandbox_minijail( root: &Path, max_open_files: u64, config: &SandboxConfig, ) -> Result<Minijail>149 pub fn create_sandbox_minijail(
150     root: &Path,
151     max_open_files: u64,
152     config: &SandboxConfig,
153 ) -> Result<Minijail> {
154     let mut jail = create_base_minijail(root, max_open_files)?;
155 
156     jail.namespace_pids();
157     jail.namespace_user();
158     jail.namespace_user_disable_setgroups();
159     if config.limit_caps {
160         // Don't need any capabilities.
161         jail.use_caps(0);
162     }
163     match config.run_as {
164         RunAsUser::Unspecified => {
165             if config.bind_mounts && config.ugid_map.is_none() {
166                 // Minijail requires to set user/group map to mount extra directories.
167                 add_current_user_to_jail(&mut jail)?;
168             }
169         }
170         RunAsUser::CurrentUser => {
171             add_current_user_to_jail(&mut jail)?;
172         }
173         RunAsUser::Root => {
174             // Add the current user as root in the jail.
175             let crosvm_uid = geteuid();
176             let crosvm_gid = getegid();
177             jail.uidmap(&format!("0 {} 1", crosvm_uid))
178                 .context("error setting UID map")?;
179             jail.gidmap(&format!("0 {} 1", crosvm_gid))
180                 .context("error setting GID map")?;
181         }
182         RunAsUser::Specified(uid, gid) => {
183             if uid != 0 {
184                 jail.change_uid(uid)
185             }
186             if gid != 0 {
187                 jail.change_gid(gid)
188             }
189         }
190     }
191     if config.bind_mounts {
192         // Create a tmpfs in the device's root directory so that we can bind mount files.
193         // The size=67108864 is size=64*1024*1024 or size=64MB.
194         // TODO(b/267581374): Use appropriate size for tmpfs.
195         jail.mount_with_data(
196             Path::new("none"),
197             Path::new("/"),
198             "tmpfs",
199             (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
200             "size=67108864",
201         )?;
202     }
203     if let Some((uid_map, gid_map)) = config.ugid_map {
204         jail.uidmap(uid_map).context("error setting UID map")?;
205         jail.gidmap(gid_map).context("error setting GID map")?;
206     }
207     // Run in a new mount namespace.
208     jail.namespace_vfs();
209 
210     if config.namespace_net {
211         // Run in an empty network namespace.
212         jail.namespace_net();
213     }
214 
215     // Don't allow the device to gain new privileges.
216     jail.no_new_privs();
217 
218     #[cfg(feature = "seccomp_trace")]
219     {
220         #[repr(C)]
221         #[derive(AsBytes)]
222         struct sock_filter {
223             /* Filter block */
224             code: u16, /* Actual filter code */
225             jt: u8,    /* Jump true */
226             jf: u8,    /* Jump false */
227             k: u32,    /* Generic multiuse field */
228         }
229 
230         // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
231         // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
232         const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
233         const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
234         const BPF_RET: u16 = 0x06;
235         const BPF_K: u16 = 0x00;
236 
237         // return SECCOMP_RET_LOG for all syscalls
238         const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
239             code: BPF_RET | BPF_K,
240             jt: 0,
241             jf: 0,
242             k: SECCOMP_RET_LOG,
243         };
244 
245         warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
246         debug!(
247             "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
248             config.seccomp_policy_name,
249             read_jail_addr(&jail),
250         );
251         jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
252             .unwrap();
253     }
254 
255     #[cfg(not(feature = "seccomp_trace"))]
256     if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
257         let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
258         // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
259         // is expected to be compiled using "trap" as the failure behavior instead of the default
260         // "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
261         // the built-in pre-compiled policies will be used.
262         // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
263         // explanation about why the |log_failures| flag forces the use of .policy files (and the
264         // build-time alternative to this run-time flag).
265         let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
266         if bpf_policy_file.exists() && !config.log_failures {
267             jail.parse_seccomp_program(&bpf_policy_file)
268                 .with_context(|| {
269                     format!(
270                         "failed to parse precompiled seccomp policy: {}",
271                         bpf_policy_file.display()
272                     )
273                 })?;
274         } else {
275             // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
276             // kill the entire device process if a worker thread commits a seccomp violation.
277             jail.set_seccomp_filter_tsync();
278             if config.log_failures {
279                 jail.log_seccomp_filter_failures();
280             }
281             let bpf_policy_file = seccomp_policy_path.with_extension("policy");
282             jail.parse_seccomp_filters(&bpf_policy_file)
283                 .with_context(|| {
284                     format!(
285                         "failed to parse seccomp policy: {}",
286                         bpf_policy_file.display()
287                     )
288                 })?;
289         }
290     } else {
291         let bpf_program = EMBEDDED_BPFS
292             .get(&config.seccomp_policy_name)
293             .with_context(|| {
294                 format!(
295                     "failed to find embedded seccomp policy: {}",
296                     &config.seccomp_policy_name
297                 )
298             })?;
299         jail.parse_seccomp_bytes(bpf_program).with_context(|| {
300             format!(
301                 "failed to parse embedded seccomp policy: {}",
302                 &config.seccomp_policy_name
303             )
304         })?;
305     }
306 
307     jail.use_seccomp_filter();
308     // Don't do init setup.
309     jail.run_as_init();
310     // Set up requested remount mode instead of default MS_PRIVATE.
311     if let Some(mode) = config.remount_mode {
312         jail.set_remount_mode(mode);
313     }
314 
315     Ok(jail)
316 }
317 
318 /// Creates a basic [Minijail] if `jail_config` is present.
319 ///
320 /// Returns `None` if `jail_config` is none.
simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>>321 pub fn simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>> {
322     if let Some(jail_config) = jail_config {
323         let config = SandboxConfig::new(jail_config, policy);
324         Ok(Some(create_sandbox_minijail(
325             &jail_config.pivot_root,
326             MAX_OPEN_FILES_DEFAULT,
327             &config,
328         )?))
329     } else {
330         Ok(None)
331     }
332 }
333 
334 /// Creates [Minijail] for gpu processes.
create_gpu_minijail( root: &Path, config: &SandboxConfig, render_node_only: bool, ) -> Result<Minijail>335 pub fn create_gpu_minijail(
336     root: &Path,
337     config: &SandboxConfig,
338     render_node_only: bool,
339 ) -> Result<Minijail> {
340     let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
341 
342     // Device nodes required for DRM.
343     let sys_dev_char_path = Path::new("/sys/dev/char");
344     jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
345 
346     // Necessary for CGROUP control of the vGPU threads
347     // This is not necessary UNLESS one wants to make use
348     // of the gpu cgroup command line options.
349     let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
350     if sys_cpuset_path.exists() {
351         jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
352     }
353 
354     let sys_devices_path = Path::new("/sys/devices");
355     jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
356 
357     jail_mount_bind_drm(&mut jail, render_node_only)?;
358 
359     // If the ARM specific devices exist on the host, bind mount them in.
360     let mali0_path = Path::new("/dev/mali0");
361     if mali0_path.exists() {
362         jail.mount_bind(mali0_path, mali0_path, true)?;
363     }
364 
365     let pvr_sync_path = Path::new("/dev/pvr_sync");
366     if pvr_sync_path.exists() {
367         jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
368     }
369 
370     // If the udmabuf driver exists on the host, bind mount it in.
371     let udmabuf_path = Path::new("/dev/udmabuf");
372     if udmabuf_path.exists() {
373         jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
374     }
375 
376     // Libraries that are required when mesa drivers are dynamically loaded.
377     jail_mount_bind_if_exists(
378         &mut jail,
379         &[
380             "/usr/lib",
381             "/usr/lib64",
382             "/lib",
383             "/lib64",
384             "/usr/share/drirc.d",
385             "/usr/share/glvnd",
386             "/usr/share/libdrm",
387             "/usr/share/vulkan",
388         ],
389     )?;
390 
391     // pvr driver requires read access to /proc/self/task/*/comm.
392     mount_proc(&mut jail)?;
393 
394     // To enable perfetto tracing, we need to give access to the perfetto service IPC
395     // endpoints.
396     let perfetto_path = Path::new("/run/perfetto");
397     if perfetto_path.exists() {
398         jail.mount_bind(perfetto_path, perfetto_path, true)?;
399     }
400 
401     Ok(jail)
402 }
403 
404 /// Selectively bind mount drm nodes into `jail` based on `render_node_only`
405 ///
406 /// This function will not return an error if drm nodes don't exist
jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()>407 pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
408     if render_node_only {
409         const DRM_NUM_NODES: u32 = 63;
410         const DRM_RENDER_NODE_START: u32 = 128;
411         for offset in 0..DRM_NUM_NODES {
412             let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
413             let drm_dri_path = Path::new(&path_str);
414             if !drm_dri_path.exists() {
415                 break;
416             }
417             jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
418         }
419     } else {
420         let drm_dri_path = Path::new("/dev/dri");
421         if drm_dri_path.exists() {
422             jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
423         }
424     }
425 
426     Ok(())
427 }
428 
429 /// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
430 ///
431 /// This function will not return an error if any of the directories in `dirs` is missing.
jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>( jail: &mut Minijail, dirs: &[P], ) -> Result<()>432 pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
433     jail: &mut Minijail,
434     dirs: &[P],
435 ) -> Result<()> {
436     for dir in dirs {
437         let dir_path = Path::new(dir);
438         if dir_path.exists() {
439             jail.mount_bind(dir_path, dir_path, false)?;
440         }
441     }
442 
443     Ok(())
444 }
445 
446 /// Mount proc in the sandbox.
mount_proc(jail: &mut Minijail) -> Result<()>447 pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
448     jail.mount(
449         Path::new("proc"),
450         Path::new("/proc"),
451         "proc",
452         (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
453     )?;
454     Ok(())
455 }
456 
/// Reads the first pointer-sized word of `jail`, used as an identifier for tracking the jail's
/// lifetime in seccomp_trace logs.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // We can only hope minijail's rust object will always only contain a pointer to C jail struct
    assert_eq_size!(Minijail, usize);
    let first_word = (jail as *const Minijail).cast::<usize>();
    // SAFETY: `jail` is a live reference and the static assert above guarantees that a `usize`
    // read stays within the bounds of the `Minijail` object.
    unsafe { first_word.read() }
}
465 
466 /// Set the uid/gid for the jailed process and give a basic id map. This is
467 /// required for bind mounts to work.
add_current_user_to_jail(jail: &mut Minijail) -> Result<()>468 fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
469     let crosvm_uid = geteuid();
470     let crosvm_gid = getegid();
471 
472     jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
473         .context("error setting UID map")?;
474     jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
475         .context("error setting GID map")?;
476 
477     if crosvm_uid != 0 {
478         jail.change_uid(crosvm_uid);
479     }
480     if crosvm_gid != 0 {
481         jail.change_gid(crosvm_gid);
482     }
483     Ok(())
484 }
485