1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #![deny(missing_docs)]
6 #![allow(dead_code)]
7
8 use std::path::Path;
9 use std::str;
10
11 use anyhow::bail;
12 use anyhow::Context;
13 use anyhow::Result;
14 #[cfg(feature = "seccomp_trace")]
15 use base::debug;
16 use base::getegid;
17 use base::geteuid;
18 #[cfg(feature = "seccomp_trace")]
19 use base::warn;
20 use libc::c_ulong;
21 use minijail::Minijail;
22 use once_cell::sync::Lazy;
23 #[cfg(feature = "seccomp_trace")]
24 use static_assertions::const_assert;
25 #[cfg(feature = "seccomp_trace")]
26 use zerocopy::Immutable;
27 #[cfg(feature = "seccomp_trace")]
28 use zerocopy::IntoBytes;
29
30 use crate::config::JailConfig;
31
32 // ANDROID: b/246968493
33 #[cfg(not(feature = "seccomp_trace"))]
34 static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
35 Lazy::new(|| std::collections::HashMap::<&str, Vec<u8>>::new());
36
/// Default open-file limit for jailed devices; most devices don't need to open many fds.
///
/// `simple_jail` passes this value as `max_open_files`, which ends up as `RLIMIT_NOFILE`.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 1024;
/// The max open files for gpu processes (used by `create_gpu_minijail`).
const MAX_OPEN_FILES_FOR_GPU: u64 = 32768;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
// NOTE(review): FD_RAW_FAILURE is not defined in this file; keep the two values in sync.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 65536;
43
/// The user in the jail to run as.
///
/// The type is a small plain value (at most two `u32`s), so it is `Copy` and can be compared and
/// printed for diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid.
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
56
/// Config for the sandbox to be created by [Minijail].
///
/// Build one with `SandboxConfig::new` and then adjust the public fields as needed before
/// passing it to `create_sandbox_minijail`.
pub struct SandboxConfig<'a> {
    /// Whether or not to drop all capabilities in the sandbox.
    pub limit_caps: bool,
    // When true, seccomp filter failures are logged and the textual `.policy` file is parsed
    // even if a precompiled `.bpf` file exists (only consulted when a policy dir is set).
    log_failures: bool,
    // Directory holding the seccomp policy files. When `None`, the embedded policy registered
    // under `seccomp_policy_name` is used instead.
    seccomp_policy_dir: Option<&'a Path>,
    // Policy name: joined onto `seccomp_policy_dir` (the extension is then replaced by `bpf` or
    // `policy`), or used as the lookup key for an embedded policy.
    seccomp_policy_name: &'a str,
    /// The pair of `uid_map` and `gid_map`.
    pub ugid_map: Option<(&'a str, &'a str)>,
    /// The remount mode instead of default MS_PRIVATE.
    pub remount_mode: Option<c_ulong>,
    /// Whether to use empty net namespace. Enabled by default.
    pub namespace_net: bool,
    /// Whether or not to configure the jail to support bind-mounts.
    ///
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
    /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
    /// sandbox.
    pub bind_mounts: bool,
    /// Specify the user in the jail to run as.
    pub run_as: RunAsUser,
}
79
80 impl<'a> SandboxConfig<'a> {
81 /// Creates [SandboxConfig].
new(jail_config: &'a JailConfig, policy: &'a str) -> Self82 pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
83 Self {
84 limit_caps: true,
85 log_failures: jail_config.seccomp_log_failures,
86 seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
87 seccomp_policy_name: policy,
88 ugid_map: None,
89 remount_mode: None,
90 namespace_net: true,
91 bind_mounts: false,
92 run_as: RunAsUser::Unspecified,
93 }
94 }
95 }
96
97 /// Wrapper that cleans up a [Minijail] when it is dropped
98 pub struct ScopedMinijail(pub Minijail);
99
100 impl Drop for ScopedMinijail {
drop(&mut self)101 fn drop(&mut self) {
102 let _ = self.0.kill();
103 }
104 }
105
106 /// Creates a default Minijail instance with no configuration.
create_default_minijail() -> minijail::Result<Minijail>107 pub fn create_default_minijail() -> minijail::Result<Minijail> {
108 Minijail::new().map(|mut jail| {
109 // Temporarily disable multithreaded check due to a regression in linux 6.12.5
110 // TODO(b/395899741): Remove after kernel upstream is fixed.
111 jail.disable_multithreaded_check();
112 jail
113 })
114 }
115
116 /// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
117 /// `max_open_files` using `RLIMIT_NOFILE`.
118 ///
119 /// If `root` path is "/", the minijail don't change the root.
120 ///
121 /// # Arguments
122 ///
123 /// * `root` - The root path to be changed to by minijail.
124 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
125 #[allow(clippy::unnecessary_cast)]
create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail>126 pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
127 // Validate new root directory. Path::is_dir() also checks the existence.
128 if !root.is_dir() {
129 bail!("{:?} is not a directory, cannot create jail", root);
130 }
131 // chroot accepts absolute path only.
132 if !root.is_absolute() {
133 bail!("{:?} is not absolute path", root);
134 }
135
136 let mut jail = create_default_minijail().context("failed to jail device")?;
137
138 // Only pivot_root if we are not re-using the current root directory.
139 if root != Path::new("/") {
140 // Run in a new mount namespace.
141 jail.namespace_vfs();
142 jail.enter_pivot_root(root)
143 .context("failed to pivot root device")?;
144 }
145
146 jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
147 .context("error setting max open files")?;
148
149 Ok(jail)
150 }
151
152 /// Creates a [Minijail] instance which just invokes a jail process and sets
153 /// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
154 /// runs as a non-root user without SYS_ADMIN capabilities.
155 ///
156 /// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
157 /// and `mount namespace`. So, it runs as a non-root user without
158 /// SYS_ADMIN capabilities.
159 ///
160 /// Note that since there is no file system isolation provided by this function,
161 /// caller of this function should enforce other security mechanisum such as selinux
162 /// on the host to protect directories.
163 ///
164 /// # Arguments
165 ///
166 /// * `root` - The root path to checked before the process is jailed
167 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
168 #[allow(clippy::unnecessary_cast)]
create_base_minijail_without_pivot_root( root: &Path, max_open_files: u64, ) -> Result<Minijail>169 pub fn create_base_minijail_without_pivot_root(
170 root: &Path,
171 max_open_files: u64,
172 ) -> Result<Minijail> {
173 // Validate new root directory. Path::is_dir() also checks the existence.
174 if !root.is_dir() {
175 bail!("{:?} is not a directory, cannot create jail", root);
176 }
177 if !root.is_absolute() {
178 bail!("{:?} is not absolute path", root);
179 }
180
181 let mut jail = create_default_minijail().context("failed to jail device")?;
182 jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
183 .context("error setting max open files")?;
184
185 Ok(jail)
186 }
187
/// Creates a [Minijail] instance which creates a sandbox.
///
/// Starting from `create_base_minijail`, the jail is configured in this order: PID and user
/// namespaces, optional capability drop, user selection per `config.run_as`, optional tmpfs
/// root for bind mounts, optional explicit uid/gid maps, mount namespace, optional network
/// namespace, `no_new_privs`, the seccomp policy, and finally the optional remount mode.
///
/// # Arguments
///
/// * `root` - The root path to be changed to by minijail.
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
/// * `config` - The [SandboxConfig] to control details of the sandbox.
///
/// # Errors
///
/// Returns an error if the base jail cannot be created, if setting a uid/gid map fails, if the
/// tmpfs mount fails, or if the seccomp policy cannot be found or parsed.
///
/// # Panics
///
/// When built with the `seccomp_trace` feature, panics if the built-in log-everything filter is
/// rejected by `parse_seccomp_bytes`.
pub fn create_sandbox_minijail(
    root: &Path,
    max_open_files: u64,
    config: &SandboxConfig,
) -> Result<Minijail> {
    let mut jail = create_base_minijail(root, max_open_files)?;

    jail.namespace_pids();
    jail.namespace_user();
    jail.namespace_user_disable_setgroups();
    if config.limit_caps {
        // Don't need any capabilities.
        jail.use_caps(0);
    }
    match config.run_as {
        RunAsUser::Unspecified => {
            if config.bind_mounts && config.ugid_map.is_none() {
                // Minijail requires to set user/group map to mount extra directories.
                add_current_user_to_jail(&mut jail)?;
            }
        }
        RunAsUser::CurrentUser => {
            add_current_user_to_jail(&mut jail)?;
        }
        RunAsUser::Root => {
            // Add the current user as root in the jail.
            let crosvm_uid = geteuid();
            let crosvm_gid = getegid();
            jail.uidmap(&format!("0 {} 1", crosvm_uid))
                .context("error setting UID map")?;
            jail.gidmap(&format!("0 {} 1", crosvm_gid))
                .context("error setting GID map")?;
        }
        RunAsUser::Specified(uid, gid) => {
            // No id map is installed in this arm; it is expected to come from `config.ugid_map`
            // below. Ids equal to 0 are left unchanged.
            if uid != 0 {
                jail.change_uid(uid)
            }
            if gid != 0 {
                jail.change_gid(gid)
            }
        }
    }
    if config.bind_mounts {
        // Create a tmpfs in the device's root directory so that we can bind mount files.
        // The size=67108864 is size=64*1024*1024 or size=64MB.
        // TODO(b/267581374): Use appropriate size for tmpfs.
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
            "size=67108864",
        )?;
    }
    // Explicit maps are applied after the `run_as` handling above.
    if let Some((uid_map, gid_map)) = config.ugid_map {
        jail.uidmap(uid_map).context("error setting UID map")?;
        jail.gidmap(gid_map).context("error setting GID map")?;
    }
    // Run in a new mount namespace.
    jail.namespace_vfs();

    if config.namespace_net {
        // Run in an empty network namespace.
        jail.namespace_net();
    }

    // Don't allow the device to gain new privileges.
    jail.no_new_privs();

    // Debug-only build: instead of a real policy, install a single-instruction filter that
    // returns SECCOMP_RET_LOG for every syscall.
    #[cfg(feature = "seccomp_trace")]
    {
        #[repr(C)]
        #[derive(Immutable, IntoBytes)]
        struct sock_filter {
            /* Filter block */
            code: u16, /* Actual filter code */
            jt: u8,    /* Jump true */
            jf: u8,    /* Jump false */
            k: u32,    /* Generic multiuse field */
        }

        // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
        // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
        const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
        const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
        const BPF_RET: u16 = 0x06;
        const BPF_K: u16 = 0x00;

        // return SECCOMP_RET_LOG for all syscalls
        const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
            code: BPF_RET | BPF_K,
            jt: 0,
            jf: 0,
            k: SECCOMP_RET_LOG,
        };

        warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
        debug!(
            "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
            config.seccomp_policy_name,
            read_jail_addr(&jail),
        );
        jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
            .unwrap();
    }

    // The cfg attribute below covers the whole `if let ... else ...` statement.
    #[cfg(not(feature = "seccomp_trace"))]
    if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
        let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
        // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
        // is expected to be compiled using "trap" as the failure behavior instead of the default
        // "kill" behavior) when a policy path is supplied in the command line arguments. Otherwise
        // the built-in pre-compiled policies will be used.
        // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
        // explanation about why the |log_failures| flag forces the use of .policy files (and the
        // build-time alternative to this run-time flag).
        let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
        if bpf_policy_file.exists() && !config.log_failures {
            jail.parse_seccomp_program(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse precompiled seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        } else {
            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
            // kill the entire device process if a worker thread commits a seccomp violation.
            jail.set_seccomp_filter_tsync();
            if config.log_failures {
                jail.log_seccomp_filter_failures();
            }
            // Despite the name, this shadowing binding points at the textual `.policy` file.
            let bpf_policy_file = seccomp_policy_path.with_extension("policy");
            jail.parse_seccomp_filters(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        }
    } else {
        // No policy directory configured: fall back to the policy embedded in the binary.
        set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
    }

    jail.use_seccomp_filter();
    // Don't do init setup.
    jail.run_as_init();
    // Set up requested remount mode instead of default MS_PRIVATE.
    if let Some(mode) = config.remount_mode {
        jail.set_remount_mode(mode);
    }

    Ok(jail)
}
350
351 /// Creates a basic [Minijail] if `jail_config` is present.
352 ///
353 /// Returns `None` if `jail_config` is none.
simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>>354 pub fn simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>> {
355 if let Some(jail_config) = jail_config {
356 let config = SandboxConfig::new(jail_config, policy);
357 Ok(Some(create_sandbox_minijail(
358 &jail_config.pivot_root,
359 MAX_OPEN_FILES_DEFAULT,
360 &config,
361 )?))
362 } else {
363 Ok(None)
364 }
365 }
366
367 /// Creates [Minijail] for gpu processes.
create_gpu_minijail( root: &Path, config: &SandboxConfig, render_node_only: bool, snapshot_scratch_directory: Option<&Path>, ) -> Result<Minijail>368 pub fn create_gpu_minijail(
369 root: &Path,
370 config: &SandboxConfig,
371 render_node_only: bool,
372 snapshot_scratch_directory: Option<&Path>,
373 ) -> Result<Minijail> {
374 let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
375
376 // Device nodes required for DRM.
377 let sys_dev_char_path = Path::new("/sys/dev/char");
378 jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
379
380 // Necessary for CGROUP control of the vGPU threads
381 // This is not necessary UNLESS one wants to make use
382 // of the gpu cgroup command line options.
383 let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
384 if sys_cpuset_path.exists() {
385 jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
386 }
387
388 let sys_devices_path = Path::new("/sys/devices");
389 jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
390
391 jail_mount_bind_drm(&mut jail, render_node_only)?;
392
393 // If the ARM specific devices exist on the host, bind mount them in.
394 let mali0_path = Path::new("/dev/mali0");
395 if mali0_path.exists() {
396 jail.mount_bind(mali0_path, mali0_path, true)?;
397 }
398
399 let pvr_sync_path = Path::new("/dev/pvr_sync");
400 if pvr_sync_path.exists() {
401 jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
402 }
403
404 // If the udmabuf driver exists on the host, bind mount it in.
405 let udmabuf_path = Path::new("/dev/udmabuf");
406 if udmabuf_path.exists() {
407 jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
408 }
409
410 // Libraries that are required when mesa drivers are dynamically loaded.
411 jail_mount_bind_if_exists(
412 &mut jail,
413 &[
414 "/usr/lib",
415 "/usr/lib64",
416 "/lib",
417 "/lib64",
418 "/usr/share/drirc.d",
419 "/usr/share/glvnd",
420 "/usr/share/libdrm",
421 "/usr/share/vulkan",
422 ],
423 )?;
424
425 // pvr driver requires read access to /proc/self/task/*/comm.
426 mount_proc(&mut jail)?;
427
428 // To enable perfetto tracing, we need to give access to the perfetto service IPC
429 // endpoints.
430 let perfetto_path = Path::new("/run/perfetto");
431 if perfetto_path.exists() {
432 jail.mount_bind(perfetto_path, perfetto_path, true)?;
433 }
434
435 // Provide scratch space for the GPU device to build or unpack snapshots.
436 if let Some(snapshot_scratch_directory) = snapshot_scratch_directory {
437 jail.mount_with_data(
438 Path::new("none"),
439 snapshot_scratch_directory,
440 "tmpfs",
441 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
442 "size=4294967296",
443 )?;
444 }
445
446 Ok(jail)
447 }
448
449 /// Selectively bind mount drm nodes into `jail` based on `render_node_only`
450 ///
451 /// This function will not return an error if drm nodes don't exist
jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()>452 pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
453 if render_node_only {
454 const DRM_NUM_NODES: u32 = 63;
455 const DRM_RENDER_NODE_START: u32 = 128;
456 for offset in 0..DRM_NUM_NODES {
457 let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
458 let drm_dri_path = Path::new(&path_str);
459 if !drm_dri_path.exists() {
460 break;
461 }
462 jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
463 }
464 } else {
465 let drm_dri_path = Path::new("/dev/dri");
466 if drm_dri_path.exists() {
467 jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
468 }
469 }
470
471 Ok(())
472 }
473
474 /// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
475 ///
476 /// This function will not return an error if any of the directories in `dirs` is missing.
jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>( jail: &mut Minijail, dirs: &[P], ) -> Result<()>477 pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
478 jail: &mut Minijail,
479 dirs: &[P],
480 ) -> Result<()> {
481 for dir in dirs {
482 let dir_path = Path::new(dir);
483 if dir_path.exists() {
484 jail.mount_bind(dir_path, dir_path, false)?;
485 }
486 }
487
488 Ok(())
489 }
490
491 /// Mount proc in the sandbox.
mount_proc(jail: &mut Minijail) -> Result<()>492 pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
493 jail.mount(
494 Path::new("proc"),
495 Path::new("/proc"),
496 "proc",
497 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
498 )?;
499 Ok(())
500 }
501
/// Read minijail internal struct address for uniquely identifying and tracking jail's lifetime
///
/// The value returned is the first `usize`-sized word of the [Minijail] object.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // We can only hope minijail's rust object will always contain a pointer to C jail struct as the
    // first field.
    const_assert!(std::mem::size_of::<Minijail>() >= std::mem::size_of::<usize>());
    let first_word = (jail as *const Minijail).cast::<usize>();
    // SAFETY: `jail` is a valid reference and the static assert above guarantees the object is
    // at least one `usize` long, so the read stays in bounds.
    unsafe { first_word.read() }
}
511
512 /// Set the uid/gid for the jailed process and give a basic id map. This is
513 /// required for bind mounts to work.
add_current_user_to_jail(jail: &mut Minijail) -> Result<()>514 fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
515 let crosvm_uid = geteuid();
516 let crosvm_gid = getegid();
517
518 jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
519 .context("error setting UID map")?;
520 jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
521 .context("error setting GID map")?;
522
523 if crosvm_uid != 0 {
524 jail.change_uid(crosvm_uid);
525 }
526 if crosvm_gid != 0 {
527 jail.change_gid(crosvm_gid);
528 }
529 Ok(())
530 }
531
532 /// Set the seccomp policy for a jail from embedded bpfs
set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()>533 pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
534 let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
535 format!(
536 "failed to find embedded seccomp policy: {}",
537 seccomp_policy_name
538 )
539 })?;
540 jail.parse_seccomp_bytes(bpf_program).with_context(|| {
541 format!(
542 "failed to parse embedded seccomp policy: {}",
543 seccomp_policy_name
544 )
545 })?;
546 Ok(())
547 }
548