• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #![deny(missing_docs)]
6 #![allow(dead_code)]
7 
8 use std::path::Path;
9 use std::str;
10 
11 use anyhow::bail;
12 use anyhow::Context;
13 use anyhow::Result;
14 #[cfg(feature = "seccomp_trace")]
15 use base::debug;
16 use base::getegid;
17 use base::geteuid;
18 #[cfg(feature = "seccomp_trace")]
19 use base::warn;
20 use libc::c_ulong;
21 use minijail::Minijail;
22 #[cfg(not(feature = "seccomp_trace"))]
23 use once_cell::sync::Lazy;
24 #[cfg(feature = "seccomp_trace")]
25 use static_assertions::assert_eq_size;
26 #[cfg(feature = "seccomp_trace")]
27 use zerocopy::AsBytes;
28 
29 use crate::config::JailConfig;
30 
31 // ANDROID: b/246968493
32 #[cfg(not(feature = "seccomp_trace"))]
33 static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
34     Lazy::new(|| std::collections::HashMap::<&str, Vec<u8>>::new());
35 
/// Default open-file limit for jailed devices; most devices don't need to open many fds.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 1_024;
/// Open-file limit applied to gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32_768;
40 
/// The user in the jail to run as.
///
/// `create_sandbox_minijail` reads this to decide which uid/gid maps to install.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user.
    ///
    /// The current user is still mapped into the jail when bind mounts are requested and no
    /// explicit uid/gid map was supplied.
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail (the current effective uid/gid are mapped to 0).
    Root,
}
50 
51 /// Config for the sandbox to be created by [Minijail].
52 pub struct SandboxConfig<'a> {
53     /// Whether or not to drop all capabilities in the sandbox.
54     pub limit_caps: bool,
55     log_failures: bool,
56     seccomp_policy_dir: Option<&'a Path>,
57     seccomp_policy_name: &'a str,
58     /// The pair of `uid_map` and `gid_map`.
59     pub ugid_map: Option<(&'a str, &'a str)>,
60     /// The remount mode instead of default MS_PRIVATE.
61     pub remount_mode: Option<c_ulong>,
62     /// Whether or not to configure the jail to support bind-mounts.
63     ///
64     /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
65     /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
66     /// sandbox.
67     pub bind_mounts: bool,
68     /// Specify the user in the jail to run as.
69     pub run_as: RunAsUser,
70 }
71 
72 impl<'a> SandboxConfig<'a> {
73     /// Creates [SandboxConfig].
new(jail_config: &'a JailConfig, policy: &'a str) -> Self74     pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
75         Self {
76             limit_caps: true,
77             log_failures: jail_config.seccomp_log_failures,
78             seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
79             seccomp_policy_name: policy,
80             ugid_map: None,
81             remount_mode: None,
82             bind_mounts: false,
83             run_as: RunAsUser::Unspecified,
84         }
85     }
86 }
87 
88 /// Wrapper that cleans up a [Minijail] when it is dropped
89 pub struct ScopedMinijail(pub Minijail);
90 
91 impl Drop for ScopedMinijail {
drop(&mut self)92     fn drop(&mut self) {
93         let _ = self.0.kill();
94     }
95 }
96 
97 /// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
98 /// `max_open_files` using `RLIMIT_NOFILE`.
99 ///
100 /// If `root` path is "/", the minijail don't change the root.
101 ///
102 /// # Arguments
103 ///
104 /// * `root` - The root path to be changed to by minijail.
105 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail>106 pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
107     // Validate new root directory. Path::is_dir() also checks the existence.
108     if !root.is_dir() {
109         bail!("{:?} is not a directory, cannot create jail", root);
110     }
111     // chroot accepts absolute path only.
112     if !root.is_absolute() {
113         bail!("{:?} is not absolute path", root);
114     }
115 
116     // All child jails run in a new user namespace without any users mapped, they run as nobody
117     // unless otherwise configured.
118     let mut jail = Minijail::new().context("failed to jail device")?;
119 
120     // Only pivot_root if we are not re-using the current root directory.
121     if root != Path::new("/") {
122         // It's safe to call `namespace_vfs` multiple times.
123         jail.namespace_vfs();
124         jail.enter_pivot_root(root)
125             .context("failed to pivot root device")?;
126     }
127 
128     jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
129         .context("error setting max open files")?;
130 
131     Ok(jail)
132 }
133 
134 /// Creates a [Minijail] instance which creates a sandbox.
135 ///
136 /// # Arguments
137 ///
138 /// * `root` - The root path to be changed to by minijail.
139 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
140 /// * `config` - The [SandboxConfig] to control details of the sandbox.
create_sandbox_minijail( root: &Path, max_open_files: u64, config: &SandboxConfig, ) -> Result<Minijail>141 pub fn create_sandbox_minijail(
142     root: &Path,
143     max_open_files: u64,
144     config: &SandboxConfig,
145 ) -> Result<Minijail> {
146     let mut jail = create_base_minijail(root, max_open_files)?;
147 
148     jail.namespace_pids();
149     jail.namespace_user();
150     jail.namespace_user_disable_setgroups();
151     if config.limit_caps {
152         // Don't need any capabilities.
153         jail.use_caps(0);
154     }
155     match config.run_as {
156         RunAsUser::Unspecified => {
157             if config.bind_mounts && config.ugid_map.is_none() {
158                 // Minijail requires to set user/group map to mount extra directories.
159                 add_current_user_to_jail(&mut jail)?;
160             }
161         }
162         RunAsUser::CurrentUser => {
163             add_current_user_to_jail(&mut jail)?;
164         }
165         RunAsUser::Root => {
166             // Add the current user as root in the jail.
167             let crosvm_uid = geteuid();
168             let crosvm_gid = getegid();
169             jail.uidmap(&format!("0 {} 1", crosvm_uid))
170                 .context("error setting UID map")?;
171             jail.gidmap(&format!("0 {} 1", crosvm_gid))
172                 .context("error setting GID map")?;
173         }
174     }
175     if config.bind_mounts {
176         // Create a tmpfs in the device's root directory so that we can bind mount files.
177         // The size=67108864 is size=64*1024*1024 or size=64MB.
178         // TODO(b/267581374): Use appropriate size for tmpfs.
179         jail.mount_with_data(
180             Path::new("none"),
181             Path::new("/"),
182             "tmpfs",
183             (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
184             "size=67108864",
185         )?;
186     }
187     if let Some((uid_map, gid_map)) = config.ugid_map {
188         jail.uidmap(uid_map).context("error setting UID map")?;
189         jail.gidmap(gid_map).context("error setting GID map")?;
190     }
191     // Run in a new mount namespace.
192     jail.namespace_vfs();
193 
194     // Run in an empty network namespace.
195     jail.namespace_net();
196 
197     // Don't allow the device to gain new privileges.
198     jail.no_new_privs();
199 
200     #[cfg(feature = "seccomp_trace")]
201     {
202         #[repr(C)]
203         #[derive(AsBytes)]
204         struct sock_filter {
205             /* Filter block */
206             code: u16, /* Actual filter code */
207             jt: u8,    /* Jump true */
208             jf: u8,    /* Jump false */
209             k: u32,    /* Generic multiuse field */
210         }
211 
212         // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
213         // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
214         const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
215         const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
216         const BPF_RET: u16 = 0x06;
217         const BPF_K: u16 = 0x00;
218 
219         // return SECCOMP_RET_LOG for all syscalls
220         const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
221             code: BPF_RET | BPF_K,
222             jt: 0,
223             jf: 0,
224             k: SECCOMP_RET_LOG,
225         };
226 
227         warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
228         debug!(
229             "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
230             config.seccomp_policy_name,
231             read_jail_addr(&jail),
232         );
233         jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
234             .unwrap();
235     }
236 
237     #[cfg(not(feature = "seccomp_trace"))]
238     if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
239         let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
240         // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
241         // is expected to be compiled using "trap" as the failure behavior instead of the default
242         // "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
243         // the built-in pre-compiled policies will be used.
244         // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
245         // explanation about why the |log_failures| flag forces the use of .policy files (and the
246         // build-time alternative to this run-time flag).
247         let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
248         if bpf_policy_file.exists() && !config.log_failures {
249             jail.parse_seccomp_program(&bpf_policy_file)
250                 .with_context(|| {
251                     format!(
252                         "failed to parse precompiled seccomp policy: {}",
253                         bpf_policy_file.display()
254                     )
255                 })?;
256         } else {
257             // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
258             // kill the entire device process if a worker thread commits a seccomp violation.
259             jail.set_seccomp_filter_tsync();
260             if config.log_failures {
261                 jail.log_seccomp_filter_failures();
262             }
263             let bpf_policy_file = seccomp_policy_path.with_extension("policy");
264             jail.parse_seccomp_filters(&bpf_policy_file)
265                 .with_context(|| {
266                     format!(
267                         "failed to parse seccomp policy: {}",
268                         bpf_policy_file.display()
269                     )
270                 })?;
271         }
272     } else {
273         let bpf_program = EMBEDDED_BPFS
274             .get(&config.seccomp_policy_name)
275             .with_context(|| {
276                 format!(
277                     "failed to find embedded seccomp policy: {}",
278                     &config.seccomp_policy_name
279                 )
280             })?;
281         jail.parse_seccomp_bytes(bpf_program).with_context(|| {
282             format!(
283                 "failed to parse embedded seccomp policy: {}",
284                 &config.seccomp_policy_name
285             )
286         })?;
287     }
288 
289     jail.use_seccomp_filter();
290     // Don't do init setup.
291     jail.run_as_init();
292     // Set up requested remount mode instead of default MS_PRIVATE.
293     if let Some(mode) = config.remount_mode {
294         jail.set_remount_mode(mode);
295     }
296 
297     Ok(jail)
298 }
299 
300 /// Creates a basic [Minijail] if `jail_config` is present.
301 ///
302 /// Returns `None` if `jail_config` is none.
simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>>303 pub fn simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>> {
304     if let Some(jail_config) = jail_config {
305         let config = SandboxConfig::new(jail_config, policy);
306         Ok(Some(create_sandbox_minijail(
307             &jail_config.pivot_root,
308             MAX_OPEN_FILES_DEFAULT,
309             &config,
310         )?))
311     } else {
312         Ok(None)
313     }
314 }
315 
316 /// Creates [Minijail] for gpu processes.
create_gpu_minijail(root: &Path, config: &SandboxConfig) -> Result<Minijail>317 pub fn create_gpu_minijail(root: &Path, config: &SandboxConfig) -> Result<Minijail> {
318     let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
319 
320     // Device nodes required for DRM.
321     let sys_dev_char_path = Path::new("/sys/dev/char");
322     jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
323 
324     // Necessary for CGROUP control of the vGPU threads
325     // This is not necessary UNLESS one wants to make use
326     // of the gpu cgroup command line options.
327     let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
328     if sys_cpuset_path.exists() {
329         jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
330     }
331 
332     let sys_devices_path = Path::new("/sys/devices");
333     jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
334 
335     let drm_dri_path = Path::new("/dev/dri");
336     if drm_dri_path.exists() {
337         jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
338     }
339 
340     // If the ARM specific devices exist on the host, bind mount them in.
341     let mali0_path = Path::new("/dev/mali0");
342     if mali0_path.exists() {
343         jail.mount_bind(mali0_path, mali0_path, true)?;
344     }
345 
346     let pvr_sync_path = Path::new("/dev/pvr_sync");
347     if pvr_sync_path.exists() {
348         jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
349     }
350 
351     // If the udmabuf driver exists on the host, bind mount it in.
352     let udmabuf_path = Path::new("/dev/udmabuf");
353     if udmabuf_path.exists() {
354         jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
355     }
356 
357     // Libraries that are required when mesa drivers are dynamically loaded.
358     jail_mount_bind_if_exists(
359         &mut jail,
360         &[
361             "/usr/lib",
362             "/usr/lib64",
363             "/lib",
364             "/lib64",
365             "/usr/share/drirc.d",
366             "/usr/share/glvnd",
367             "/usr/share/vulkan",
368         ],
369     )?;
370 
371     // pvr driver requires read access to /proc/self/task/*/comm.
372     mount_proc(&mut jail)?;
373 
374     // To enable perfetto tracing, we need to give access to the perfetto service IPC
375     // endpoints.
376     let perfetto_path = Path::new("/run/perfetto");
377     if perfetto_path.exists() {
378         jail.mount_bind(perfetto_path, perfetto_path, true)?;
379     }
380 
381     Ok(jail)
382 }
383 
384 /// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
385 ///
386 /// This function will not return an error if any of the directories in `dirs` is missing.
jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>( jail: &mut Minijail, dirs: &[P], ) -> Result<()>387 pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
388     jail: &mut Minijail,
389     dirs: &[P],
390 ) -> Result<()> {
391     for dir in dirs {
392         let dir_path = Path::new(dir);
393         if dir_path.exists() {
394             jail.mount_bind(dir_path, dir_path, false)?;
395         }
396     }
397 
398     Ok(())
399 }
400 
401 /// Mount proc in the sandbox.
mount_proc(jail: &mut Minijail) -> Result<()>402 pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
403     jail.mount(
404         Path::new("proc"),
405         Path::new("/proc"),
406         "proc",
407         (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
408     )?;
409     Ok(())
410 }
411 
/// Reads the first pointer-sized word of `jail`, used as an id for tracking the jail's lifetime.
///
/// This relies on the Rust `Minijail` object consisting solely of a pointer to the C jail
/// struct. Only its size is checked at compile time; the contents are an assumption.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // We can only hope minijail's rust object will always only contain a pointer to C jail struct
    assert_eq_size!(Minijail, usize);
    let first_word = (jail as *const Minijail).cast::<usize>();
    // SAFETY: `jail` is a valid reference and the static assertion above guarantees that a
    // single `usize` read stays within the bounds of the `Minijail` value.
    unsafe { *first_word }
}
420 
421 /// Set the uid/gid for the jailed process and give a basic id map. This is
422 /// required for bind mounts to work.
add_current_user_to_jail(jail: &mut Minijail) -> Result<()>423 fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
424     let crosvm_uid = geteuid();
425     let crosvm_gid = getegid();
426 
427     jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
428         .context("error setting UID map")?;
429     jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
430         .context("error setting GID map")?;
431 
432     if crosvm_uid != 0 {
433         jail.change_uid(crosvm_uid);
434     }
435     if crosvm_gid != 0 {
436         jail.change_gid(crosvm_gid);
437     }
438     Ok(())
439 }
440