1 // Copyright 2019 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::{
6 borrow::Cow,
7 cmp,
8 collections::{btree_map, BTreeMap},
9 ffi::{CStr, CString},
10 fs::File,
11 io,
12 mem::{self, size_of, MaybeUninit},
13 os::raw::{c_int, c_long},
14 ptr::{addr_of, addr_of_mut},
15 str::FromStr,
16 sync::{
17 atomic::{AtomicBool, AtomicU64, Ordering},
18 Arc,
19 },
20 time::Duration,
21 };
22
23 use base::{
24 error, ioctl_ior_nr, ioctl_iow_nr, ioctl_iowr_nr, ioctl_with_mut_ptr, ioctl_with_ptr, syscall,
25 AsRawDescriptor, FileFlags, FromRawDescriptor, RawDescriptor,
26 };
27 use data_model::DataInit;
28 use fuse::filesystem::{
29 Context, DirectoryIterator, Entry, FileSystem, FsOptions, GetxattrReply, IoctlFlags,
30 IoctlReply, ListxattrReply, OpenOptions, RemoveMappingOne, SetattrValid, ZeroCopyReader,
31 ZeroCopyWriter, ROOT_ID,
32 };
33 use fuse::sys::WRITE_KILL_PRIV;
34 use fuse::Mapper;
35 use sync::Mutex;
36
37 #[cfg(feature = "chromeos")]
38 use {
39 protobuf::Message,
40 system_api::client::OrgChromiumArcQuota,
41 system_api::UserDataAuth::{
42 SetMediaRWDataFileProjectIdReply, SetMediaRWDataFileProjectIdRequest,
43 },
44 };
45
46 use crate::virtio::fs::caps::{Capability, Caps, Set as CapSet, Value as CapValue};
47 use crate::virtio::fs::multikey::MultikeyBTreeMap;
48 use crate::virtio::fs::read_dir::ReadDir;
49
// Nul-terminated byte strings. `EMPTY_CSTR` and `PROC_CSTR` are converted with
// `CStr::from_bytes_with_nul_unchecked` elsewhere in this file, so the trailing nul byte must be
// kept and no interior nul bytes may be added.
const EMPTY_CSTR: &[u8] = b"\0";
const ROOT_CSTR: &[u8] = b"/\0";
const PROC_CSTR: &[u8] = b"/proc\0";

// Prefix prepended to re-written xattr names (see `rewrite_xattr_name`).
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
// Names starting with this prefix are candidates for re-writing.
const SECURITY_XATTR: &[u8] = b"security.";
// The one security xattr that is never re-written.
const SELINUX_XATTR: &[u8] = b"security.selinux";

// Array lengths used by the fscrypt policy structs below.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "chromeos")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
64
// C-layout description of a version 1 fscrypt policy. It is the `_v1` member of the
// `fscrypt_policy` union.
#[repr(C)]
#[derive(Clone, Copy)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
// NOTE(review): soundness depends on `DataInit`'s contract (defined in `data_model`, not visible
// here). The struct contains only `u8` fields and `u8` arrays.
unsafe impl DataInit for fscrypt_policy_v1 {}
75
// C-layout description of a version 2 fscrypt policy. It is the `_v2` member of the
// `fscrypt_policy` union.
#[repr(C)]
#[derive(Clone, Copy)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
// NOTE(review): soundness depends on `DataInit`'s contract (defined in `data_model`, not visible
// here). The struct contains only `u8` fields and `u8` arrays.
unsafe impl DataInit for fscrypt_policy_v2 {}
87
// Union over the supported fscrypt policy layouts. Both policy structs start with a `u8`
// version field, which `_version` overlays.
#[repr(C)]
#[derive(Copy, Clone)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
// NOTE(review): soundness depends on `DataInit`'s contract (defined in `data_model`, not visible
// here). Every member is built only from `u8` fields and `u8` arrays.
unsafe impl DataInit for fscrypt_policy {}
96
// C-layout argument struct carrying a policy size and a policy buffer.
// NOTE(review): presumably the argument of `FS_IOC_GET_ENCRYPTION_POLICY_EX`; confirm at the
// call site, which is outside this section.
#[repr(C)]
#[derive(Copy, Clone)]
struct fscrypt_get_policy_ex_arg {
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}
// NOTE(review): soundness depends on `DataInit`'s contract (defined in `data_model`, not visible
// here).
unsafe impl DataInit for fscrypt_get_policy_ex_arg {}
104
// Read/write ioctl with type 'f', number 22 and a 9-byte payload type.
// NOTE(review): the declared payload is `[u8; 9]`, not `fscrypt_get_policy_ex_arg`; presumably
// this mirrors the kernel header's definition. Confirm before changing.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
106
// C-layout payload type of the `FS_IOC_FSGETXATTR` and `FS_IOC_FSSETXATTR` ioctls declared
// below.
#[repr(C)]
#[derive(Clone, Copy)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set)*/
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set)*/
    fsx_pad: [u8; 8],
}
// NOTE(review): soundness depends on `DataInit`'s contract (defined in `data_model`, not visible
// here). The struct contains only `u32` fields and a `u8` array.
unsafe impl DataInit for fsxattr {}
118
// Get/set ioctls taking an `fsxattr` payload: type 'X', numbers 31 (read) and 32 (write).
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// Get/set flags ioctls: type 'f', numbers 1 (read) and 2 (write), with a `c_long` payload.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

// Same type and numbers as above, declared with an explicit 32-bit payload.
ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

// Same type and numbers as above, declared with an explicit 64-bit payload.
ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
130
// C-layout payload type of the `FS_IOC_ENABLE_VERITY` ioctl declared below.
#[repr(C)]
#[derive(Clone, Copy)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    // NOTE(review): presumably a pointer to `salt_size` bytes, stored as an integer; confirm at
    // the use site.
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    // NOTE(review): presumably a pointer to `sig_size` bytes, stored as an integer; confirm at
    // the use site.
    sig_ptr: u64,
    __reserved2: [u64; 11],
}
// NOTE(review): soundness depends on `DataInit`'s contract (defined in `data_model`, not visible
// here). The struct contains only integer fields and an integer array.
unsafe impl DataInit for fsverity_enable_arg {}
145
// C-layout header of the `FS_IOC_MEASURE_VERITY` payload declared below. The C definition ends
// with a flexible array member (kept as a comment), which is not represented in this struct.
#[repr(C)]
#[derive(Clone, Copy)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}
// NOTE(review): soundness depends on `DataInit`'s contract (defined in `data_model`, not visible
// here). The struct contains only `u16` fields.
unsafe impl DataInit for fsverity_digest {}
154
// fs-verity ioctls: type 'f', number 133 (write, `fsverity_enable_arg` payload) and number 134
// (read/write, `fsverity_digest` payload).
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
157
// Identifier of an entry in `PassthroughFs::inodes`.
type Inode = u64;
// Identifier of an entry in `PassthroughFs::handles`.
type Handle = u64;
160
// Secondary key of the inode map, built from the `st_ino` and `st_dev` values of a file's stat
// data (see `add_entry`).
#[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
166
// Coarse classification of a file derived from its mode bits.
#[derive(PartialEq, Eq)]
enum FileType {
    // `S_IFREG`.
    Regular,
    // `S_IFDIR`.
    Directory,
    // Every other file type.
    Other,
}
173
174 impl From<libc::mode_t> for FileType {
from(mode: libc::mode_t) -> Self175 fn from(mode: libc::mode_t) -> Self {
176 match mode & libc::S_IFMT {
177 libc::S_IFREG => FileType::Regular,
178 libc::S_IFDIR => FileType::Directory,
179 _ => FileType::Other,
180 }
181 }
182 }
183
// Per-inode state stored in `PassthroughFs::inodes`.
struct InodeData {
    // Key under which this entry is stored in the inode map.
    inode: Inode,
    // (File, open_flags)
    file: Mutex<(File, libc::c_int)>,
    // Starts at 1 when the entry is created and is incremented every time `add_entry` finds the
    // entry again.
    refcount: AtomicU64,
    // Classification derived from the `st_mode` seen when the entry was created.
    filetype: FileType,
}
191
192 impl AsRawDescriptor for InodeData {
as_raw_descriptor(&self) -> RawDescriptor193 fn as_raw_descriptor(&self) -> RawDescriptor {
194 self.file.lock().0.as_raw_descriptor()
195 }
196 }
197
// Per-handle state stored in `PassthroughFs::handles`.
struct HandleData {
    // Inode the handle was opened for. `find_handle` and `do_release` reject requests whose
    // inode does not match this value.
    inode: Inode,
    // The open file backing this handle.
    file: Mutex<File>,
}
202
203 impl AsRawDescriptor for HandleData {
as_raw_descriptor(&self) -> RawDescriptor204 fn as_raw_descriptor(&self) -> RawDescriptor {
205 self.file.lock().as_raw_descriptor()
206 }
207 }
208
// Generates a guard type `$name` holding a credential of type `$ty`. Creating the guard changes
// the credential of the calling thread through the raw syscall `$syscall_nr`; dropping it
// changes the credential back.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
            // credentials back to `old` when the returned struct is dropped. Returns `Ok(None)`
            // without issuing a syscall when `val` already equals `old`.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                if val != old {
                    // We want credential changes to be per-thread because otherwise
                    // we might interfere with operations being carried out on other
                    // threads with different uids/gids. However, posix requires that
                    // all threads in a process share the same credentials. To do this
                    // libc uses signals to ensure that when one thread changes its
                    // credentials the other threads do the same thing.
                    //
                    // So instead we invoke the syscall directly in order to get around
                    // this limitation. Another option is to use the setfsuid and
                    // setfsgid systems calls. However since those calls have no way to
                    // return an error, it's preferable to do this instead.

                    // This call is safe because it doesn't modify any memory and we
                    // check the return value.
                    let ret = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
                    return match ret {
                        0 => Ok(Some($name { old })),
                        _ => Err(io::Error::last_os_error()),
                    };
                }

                // Nothing to do since we already have the correct value.
                Ok(None)
            }
        }

        impl Drop for $name {
            fn drop(&mut self) {
                // This call is safe because it doesn't modify any memory and we check the
                // return value.
                let ret = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if ret >= 0 {
                    return;
                }

                // A destructor cannot report the failure to the caller, so it is only logged.
                error!(
                    "failed to change credentials back to {}: {}",
                    self.old,
                    io::Error::last_os_error(),
                );
            }
        }
    };
}
// Instantiate the credential guards. 32-bit arm builds use the `*32` variants of the syscall
// numbers; every other architecture uses the plain ones.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);

#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);

// Syscall number used to query the effective uid, selected the same way as above.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
#[cfg(target_arch = "arm")]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;

// Syscall number used to query the effective gid, selected the same way as above.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
#[cfg(target_arch = "arm")]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
281
thread_local! {
    // Effective uid and gid of the current thread, queried with raw syscalls when each
    // thread-local is initialized. `set_creds` uses them as the credentials to restore.
    //
    // Both these calls are safe because they take no parameters, and only return an integer value.
    // The kernel also guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
288
set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)>289 fn set_creds(
290 uid: libc::uid_t,
291 gid: libc::gid_t,
292 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
293 let olduid = THREAD_EUID.with(|uid| *uid);
294 let oldgid = THREAD_EGID.with(|gid| *gid);
295
296 // We have to change the gid before we change the uid because if we change the uid first then we
297 // lose the capability to change the gid. However changing back can happen in any order.
298 ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
299 }
300
// Guard that installs a umask on construction and puts the previous one back on drop.
struct ScopedUmask {
    // The umask that was in effect before `new` was called; restored on drop.
    old: libc::mode_t,
    // The umask installed by `new`; compared on drop against the value being replaced.
    mask: libc::mode_t,
}
305
306 impl ScopedUmask {
new(mask: libc::mode_t) -> ScopedUmask307 fn new(mask: libc::mode_t) -> ScopedUmask {
308 ScopedUmask {
309 // Safe because this doesn't modify any memory and always succeeds.
310 old: unsafe { libc::umask(mask) },
311 mask,
312 }
313 }
314 }
315
316 impl Drop for ScopedUmask {
drop(&mut self)317 fn drop(&mut self) {
318 // Safe because this doesn't modify any memory and always succeeds.
319 let previous = unsafe { libc::umask(self.old) };
320 debug_assert_eq!(
321 previous, self.mask,
322 "umask changed while holding ScopedUmask"
323 );
324 }
325 }
326
327 struct ScopedFsetid(Caps);
328 impl Drop for ScopedFsetid {
drop(&mut self)329 fn drop(&mut self) {
330 if let Err(e) = raise_cap_fsetid(&mut self.0) {
331 error!(
332 "Failed to restore CAP_FSETID: {}. Some operations may be broken.",
333 e
334 )
335 }
336 }
337 }
338
raise_cap_fsetid(c: &mut Caps) -> io::Result<()>339 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
340 c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
341 c.apply()
342 }
343
344 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
345 // adds the capability back when it is dropped.
drop_cap_fsetid() -> io::Result<ScopedFsetid>346 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
347 let mut caps = Caps::for_current_thread()?;
348 caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
349 caps.apply()?;
350 Ok(ScopedFsetid(caps))
351 }
352
ebadf() -> io::Error353 fn ebadf() -> io::Error {
354 io::Error::from_raw_os_error(libc::EBADF)
355 }
356
stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64>357 fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
358 let mut st = MaybeUninit::<libc::stat64>::zeroed();
359
360 // Safe because this is a constant value and a valid C string.
361 let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
362
363 // Safe because the kernel will only write data in `st` and we check the return
364 // value.
365 syscall!(unsafe {
366 libc::fstatat64(
367 f.as_raw_descriptor(),
368 pathname.as_ptr(),
369 st.as_mut_ptr(),
370 libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
371 )
372 })?;
373
374 // Safe because the kernel guarantees that the struct is now fully initialized.
375 Ok(unsafe { st.assume_init() })
376 }
377
statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64>378 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
379 let mut st = MaybeUninit::<libc::stat64>::zeroed();
380
381 // Safe because the kernel will only write data in `st` and we check the return
382 // value.
383 syscall!(unsafe {
384 libc::fstatat64(
385 dir.as_raw_descriptor(),
386 name.as_ptr(),
387 st.as_mut_ptr(),
388 libc::AT_SYMLINK_NOFOLLOW,
389 )
390 })?;
391
392 // Safe because the kernel guarantees that the struct is now fully initialized.
393 Ok(unsafe { st.assume_init() })
394 }
395
/// The caching policy that the file system should report to the FUSE client. By default the FUSE
/// protocol uses close-to-open consistency. This means that any cached contents of the file are
/// invalidated the next time that file is opened.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum CachePolicy {
    /// The client should never cache file data and all I/O should be directly forwarded to the
    /// server. This policy must be selected when file contents may change without the knowledge of
    /// the FUSE client (i.e., the file system does not have exclusive access to the directory).
    Never,

    /// The client is free to choose when and how to cache file data. This is the default policy
    /// (it is the value returned by `CachePolicy::default()`) and uses close-to-open consistency
    /// as described in the enum documentation.
    Auto,

    /// The client should always cache file data. This means that the FUSE client will not
    /// invalidate any cached data that was returned by the file system the last time the file was
    /// opened. This policy should only be selected when the file system has exclusive access to the
    /// directory.
    Always,
}
416
417 impl FromStr for CachePolicy {
418 type Err = &'static str;
419
from_str(s: &str) -> Result<Self, Self::Err>420 fn from_str(s: &str) -> Result<Self, Self::Err> {
421 match s {
422 "never" | "Never" | "NEVER" => Ok(CachePolicy::Never),
423 "auto" | "Auto" | "AUTO" => Ok(CachePolicy::Auto),
424 "always" | "Always" | "ALWAYS" => Ok(CachePolicy::Always),
425 _ => Err("invalid cache policy"),
426 }
427 }
428 }
429
430 impl Default for CachePolicy {
default() -> Self431 fn default() -> Self {
432 CachePolicy::Auto
433 }
434 }
435
/// Options that configure the behavior of the file system.
#[derive(Debug, Clone)]
pub struct Config {
    /// How long the FUSE client should consider directory entries to be valid. If the contents of a
    /// directory can only be modified by the FUSE client (i.e., the file system has exclusive
    /// access), then this should be a large value.
    ///
    /// The default value for this option is 5 seconds.
    pub entry_timeout: Duration,

    /// How long the FUSE client should consider file and directory attributes to be valid. If the
    /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file
    /// system has exclusive access), then this should be set to a large value.
    ///
    /// The default value for this option is 5 seconds.
    pub attr_timeout: Duration,

    /// The caching policy the file system should use. See the documentation of `CachePolicy` for
    /// more details.
    pub cache_policy: CachePolicy,

    /// Whether the file system should enable writeback caching. This can improve performance as it
    /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file
    /// system. However, enabling this option can increase the risk of data corruption if the file
    /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT**
    /// have exclusive access). Additionally, the file system should have read access to all files
    /// in the directory it is serving as the FUSE client may send read requests even for files
    /// opened with `O_WRONLY`.
    ///
    /// Therefore callers should only enable this option when they can guarantee that: 1) the file
    /// system has exclusive access to the directory and 2) the file system has read permissions for
    /// all files in that directory.
    ///
    /// The default value for this option is `false`.
    pub writeback: bool,

    /// Controls whether security.* xattrs (except for security.selinux) are re-written. When this
    /// is set to true, the server will add a "user.virtiofs" prefix to xattrs in the security
    /// namespace. Setting these xattrs requires CAP_SYS_ADMIN in the namespace where the file
    /// system was mounted and since the server usually runs in an unprivileged user namespace, it's
    /// unlikely to have that capability.
    ///
    /// The default value for this option is `false`.
    pub rewrite_security_xattrs: bool,

    /// Use case-insensitive lookups for directory entries (ASCII only).
    ///
    /// The default value for this option is `false`.
    pub ascii_casefold: bool,

    // UIDs which are privileged to perform quota-related operations. We cannot perform a CAP_FOWNER
    // check so we consult this list when the VM tries to set the project quota and the process uid
    // doesn't match the owner uid. In that case, all uids in this list are treated as if they have
    // CAP_FOWNER.
    //
    // The default value for this option is an empty list.
    #[cfg(feature = "chromeos")]
    pub privileged_quota_uids: Vec<libc::uid_t>,

    /// Use DAX for shared files.
    ///
    /// Enabling DAX can improve performance for frequently accessed files by mapping regions of the
    /// file directly into the VM's memory region, allowing direct access with the cost of slightly
    /// increased latency the first time the file is accessed. Additionally, since the mapping is
    /// shared directly from the host kernel's file cache, enabling DAX can improve performance even
    /// when the cache policy is `Never`.
    ///
    /// The default value for this option is `false`.
    pub use_dax: bool,

    /// Enable support for POSIX acls.
    ///
    /// Enable POSIX acl support for the shared directory. This requires that the underlying file
    /// system also supports POSIX acls.
    ///
    /// The default value for this option is `true`.
    pub posix_acl: bool,
}
512
513 impl Default for Config {
default() -> Self514 fn default() -> Self {
515 Config {
516 entry_timeout: Duration::from_secs(5),
517 attr_timeout: Duration::from_secs(5),
518 cache_policy: Default::default(),
519 writeback: false,
520 rewrite_security_xattrs: false,
521 ascii_casefold: false,
522 #[cfg(feature = "chromeos")]
523 privileged_quota_uids: Default::default(),
524 use_dax: false,
525 posix_acl: true,
526 }
527 }
528 }
529
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs {
    // File descriptors for various points in the file system tree.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    // Next inode number to hand out; starts at `ROOT_ID + 1`.
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    // Next handle number to hand out; starts at 1.
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus. Both fields are `None` when
    // `cfg.privileged_quota_uids` is empty.
    #[cfg(feature = "chromeos")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    #[cfg(feature = "chromeos")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    cfg: Config,
}
569
570 impl PassthroughFs {
new(cfg: Config) -> io::Result<PassthroughFs>571 pub fn new(cfg: Config) -> io::Result<PassthroughFs> {
572 // Safe because this is a constant value and a valid C string.
573 let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
574
575 // Safe because this doesn't modify any memory and we check the return value.
576 let raw_descriptor = syscall!(unsafe {
577 libc::openat64(
578 libc::AT_FDCWD,
579 proc_cstr.as_ptr(),
580 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
581 )
582 })?;
583
584 // Privileged UIDs can use D-Bus to perform some operations.
585 #[cfg(feature = "chromeos")]
586 let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
587 (None, None)
588 } else {
589 let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
590 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
591 channel.set_watch_enabled(true);
592 let dbus_fd = channel.watch().fd;
593 channel.set_watch_enabled(false);
594 (
595 Some(Mutex::new(dbus::blocking::Connection::from(channel))),
596 Some(dbus_fd),
597 )
598 };
599
600 // Safe because we just opened this descriptor.
601 let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
602
603 Ok(PassthroughFs {
604 inodes: Mutex::new(MultikeyBTreeMap::new()),
605 next_inode: AtomicU64::new(ROOT_ID + 1),
606
607 handles: Mutex::new(BTreeMap::new()),
608 next_handle: AtomicU64::new(1),
609
610 proc,
611
612 writeback: AtomicBool::new(false),
613 zero_message_open: AtomicBool::new(false),
614 zero_message_opendir: AtomicBool::new(false),
615
616 #[cfg(feature = "chromeos")]
617 dbus_connection,
618 #[cfg(feature = "chromeos")]
619 dbus_fd,
620
621 cfg,
622 })
623 }
624
cfg(&self) -> &Config625 pub fn cfg(&self) -> &Config {
626 &self.cfg
627 }
628
keep_rds(&self) -> Vec<RawDescriptor>629 pub fn keep_rds(&self) -> Vec<RawDescriptor> {
630 #[cfg_attr(not(feature = "chromeos"), allow(unused_mut))]
631 let mut keep_rds = vec![self.proc.as_raw_descriptor()];
632 #[cfg(feature = "chromeos")]
633 if let Some(fd) = self.dbus_fd {
634 keep_rds.push(fd);
635 }
636 keep_rds
637 }
638
rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr>639 fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
640 if !self.cfg.rewrite_security_xattrs {
641 return Cow::Borrowed(name);
642 }
643
644 // Does not include nul-terminator.
645 let buf = name.to_bytes();
646 if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
647 return Cow::Borrowed(name);
648 }
649
650 let mut newname = USER_VIRTIOFS_XATTR.to_vec();
651 newname.extend_from_slice(buf);
652
653 // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
654 // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
655 Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
656 }
657
find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>>658 fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
659 self.inodes
660 .lock()
661 .get(&inode)
662 .map(Arc::clone)
663 .ok_or_else(ebadf)
664 }
665
find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>>666 fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
667 self.handles
668 .lock()
669 .get(&handle)
670 .filter(|hd| hd.inode == inode)
671 .map(Arc::clone)
672 .ok_or_else(ebadf)
673 }
674
open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File>675 fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
676 let pathname = CString::new(format!("self/fd/{}", fd))
677 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
678
679 // Safe because this doesn't modify any memory and we check the return value. We don't
680 // really check `flags` because if the kernel can't handle poorly specified flags then we
681 // have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
682 // to follow the `/proc/self/fd` symlink to get the file.
683 let raw_descriptor = syscall!(unsafe {
684 libc::openat64(
685 self.proc.as_raw_descriptor(),
686 pathname.as_ptr(),
687 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
688 )
689 })?;
690
691 // Safe because we just opened this descriptor.
692 Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
693 }
694
open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File>695 fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
696 // When writeback caching is enabled, the kernel may send read requests even if the
697 // userspace program opened the file write-only. So we need to ensure that we have opened
698 // the file for reading as well as writing.
699 let writeback = self.writeback.load(Ordering::Relaxed);
700 if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
701 flags &= !libc::O_ACCMODE;
702 flags |= libc::O_RDWR;
703 }
704
705 // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
706 // However, this breaks atomicity as the file may have changed on disk, invalidating the
707 // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
708 // the file. Just allow this for now as it is the user's responsibility to enable writeback
709 // caching only for directories that are not shared. It also means that we need to clear the
710 // `O_APPEND` flag.
711 if writeback && flags & libc::O_APPEND != 0 {
712 flags &= !libc::O_APPEND;
713 }
714
715 self.open_fd(inode.as_raw_descriptor(), flags)
716 }
717
718 // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry719 fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry {
720 let altkey = InodeAltKey {
721 ino: st.st_ino,
722 dev: st.st_dev,
723 };
724 let data = self.inodes.lock().get_alt(&altkey).map(Arc::clone);
725
726 let inode = if let Some(data) = data {
727 // Matches with the release store in `forget`.
728 data.refcount.fetch_add(1, Ordering::Acquire);
729 data.inode
730 } else {
731 // There is a possible race here where 2 threads end up adding the same file
732 // into the inode list. However, since each of those will get a unique Inode
733 // value and unique file descriptors this shouldn't be that much of a problem.
734 let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
735 self.inodes.lock().insert(
736 inode,
737 InodeAltKey {
738 ino: st.st_ino,
739 dev: st.st_dev,
740 },
741 Arc::new(InodeData {
742 inode,
743 file: Mutex::new((f, open_flags)),
744 refcount: AtomicU64::new(1),
745 filetype: st.st_mode.into(),
746 }),
747 );
748
749 inode
750 };
751
752 Entry {
753 inode,
754 generation: 0,
755 attr: st,
756 attr_timeout: self.cfg.attr_timeout,
757 entry_timeout: self.cfg.entry_timeout,
758 }
759 }
760
761 // Performs an ascii case insensitive lookup.
ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry>762 fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
763 let mut buf = [0u8; 1024];
764 let mut offset = 0;
765 loop {
766 let mut read_dir = ReadDir::new(parent, offset, &mut buf[..])?;
767 if read_dir.remaining() == 0 {
768 break;
769 }
770
771 while let Some(entry) = read_dir.next() {
772 offset = entry.offset as libc::off64_t;
773 if name.eq_ignore_ascii_case(entry.name.to_bytes()) {
774 return self.do_lookup(parent, entry.name);
775 }
776 }
777 }
778 Err(io::Error::from_raw_os_error(libc::ENOENT))
779 }
780
do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry>781 fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
782 let st = statat(parent, name)?;
783
784 let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
785 match FileType::from(st.st_mode) {
786 FileType::Regular => {}
787 FileType::Directory => flags |= libc::O_DIRECTORY,
788 FileType::Other => flags |= libc::O_PATH,
789 }
790
791 // Safe because this doesn't modify any memory and we check the return value.
792 let f = unsafe {
793 File::from_raw_descriptor(syscall!(libc::openat64(
794 parent.as_raw_descriptor(),
795 name.as_ptr(),
796 flags
797 ))?)
798 };
799
800 Ok(self.add_entry(f, st, flags))
801 }
802
do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)>803 fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
804 let inode_data = self.find_inode(inode)?;
805
806 let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
807
808 let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
809 let data = HandleData { inode, file };
810
811 self.handles.lock().insert(handle, Arc::new(data));
812
813 let mut opts = OpenOptions::empty();
814 match self.cfg.cache_policy {
815 // We only set the direct I/O option on files.
816 CachePolicy::Never => opts.set(
817 OpenOptions::DIRECT_IO,
818 flags & (libc::O_DIRECTORY as u32) == 0,
819 ),
820 CachePolicy::Always => {
821 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
822 OpenOptions::KEEP_CACHE
823 } else {
824 OpenOptions::CACHE_DIR
825 }
826 }
827 _ => {}
828 };
829
830 Ok((Some(handle), opts))
831 }
832
do_release(&self, inode: Inode, handle: Handle) -> io::Result<()>833 fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
834 let mut handles = self.handles.lock();
835
836 if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
837 if e.get().inode == inode {
838 // We don't need to close the file here because that will happen automatically when
839 // the last `Arc` is dropped.
840 e.remove();
841 return Ok(());
842 }
843 }
844
845 Err(ebadf())
846 }
847
do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)>848 fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
849 let st = stat(inode)?;
850
851 Ok((st, self.cfg.attr_timeout))
852 }
853
do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()>854 fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
855 // Safe because this doesn't modify any memory and we check the return value.
856 syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
857 Ok(())
858 }
859
do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()>860 fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
861 // Safe because this doesn't modify any memory and we check the return value.
862 syscall!(unsafe {
863 if datasync {
864 libc::fdatasync(file.as_raw_descriptor())
865 } else {
866 libc::fsync(file.as_raw_descriptor())
867 }
868 })?;
869
870 Ok(())
871 }
872
873 // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
874 // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
875 // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
876 // root inode.
with_proc_chdir<F, T>(&self, f: F) -> T where F: FnOnce() -> T,877 fn with_proc_chdir<F, T>(&self, f: F) -> T
878 where
879 F: FnOnce() -> T,
880 {
881 let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
882
883 // Safe because this doesn't modify any memory and we check the return value. Since the
884 // fchdir should never fail we just use debug_asserts.
885 let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
886 debug_assert_eq!(
887 proc_cwd,
888 0,
889 "failed to fchdir to /proc: {}",
890 io::Error::last_os_error()
891 );
892
893 let res = f();
894
895 // Safe because this doesn't modify any memory and we check the return value. Since the
896 // fchdir should never fail we just use debug_asserts.
897 let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
898 debug_assert_eq!(
899 root_cwd,
900 0,
901 "failed to fchdir back to root directory: {}",
902 io::Error::last_os_error()
903 );
904
905 res
906 }
907
do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize>908 fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
909 let res = if inode.filetype == FileType::Other {
910 // For non-regular files and directories, we cannot open the fd normally. Instead we
911 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
912 // and then setting the CWD back to the root directory.
913 let path = CString::new(format!("self/fd/{}", inode.as_raw_descriptor()))
914 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
915
916 // Safe because this will only modify `value` and we check the return value.
917 self.with_proc_chdir(|| unsafe {
918 libc::getxattr(
919 path.as_ptr(),
920 name.as_ptr(),
921 value.as_mut_ptr() as *mut libc::c_void,
922 value.len() as libc::size_t,
923 )
924 })
925 } else {
926 // For regular files and directories, we can just use fgetxattr. Safe because this will
927 // only write to `value` and we check the return value.
928 unsafe {
929 libc::fgetxattr(
930 inode.as_raw_descriptor(),
931 name.as_ptr(),
932 value.as_mut_ptr() as *mut libc::c_void,
933 value.len() as libc::size_t,
934 )
935 }
936 };
937
938 if res < 0 {
939 Err(io::Error::last_os_error())
940 } else {
941 Ok(res as usize)
942 }
943 }
944
get_encryption_policy_ex<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>945 fn get_encryption_policy_ex<R: io::Read>(
946 &self,
947 inode: Inode,
948 handle: Handle,
949 mut r: R,
950 ) -> io::Result<IoctlReply> {
951 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
952 self.find_inode(inode)?
953 } else {
954 self.find_handle(handle, inode)?
955 };
956
957 // Safe because this only has integer fields.
958 let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
959 r.read_exact(arg.policy_size.as_mut_slice())?;
960
961 let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
962 arg.policy_size = policy_size;
963
964 // Safe because the kernel will only write to `arg` and we check the return value.
965 let res =
966 unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
967 if res < 0 {
968 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
969 } else {
970 let len = size_of::<u64>() + arg.policy_size as usize;
971 Ok(IoctlReply::Done(Ok(arg.as_slice()[..len].to_vec())))
972 }
973 }
974
get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>975 fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
976 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
977 self.find_inode(inode)?
978 } else {
979 self.find_handle(handle, inode)?
980 };
981
982 let mut buf = MaybeUninit::<fsxattr>::zeroed();
983
984 // Safe because the kernel will only write to `buf` and we check the return value.
985 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
986 if res < 0 {
987 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
988 } else {
989 // Safe because the kernel guarantees that the policy is now initialized.
990 let xattr = unsafe { buf.assume_init() };
991 Ok(IoctlReply::Done(Ok(xattr.as_slice().to_vec())))
992 }
993 }
994
// Handles FS_IOC_FSSETXATTR: reads an `fsxattr` from `r` and applies it to the inode (when
// `zero_message_open` is set) or to the handle (otherwise).
//
// With the "chromeos" feature enabled, a change of `fsx_projid` requested by the file owner or by
// a uid listed in `self.cfg.privileged_quota_uids` is first forwarded to the
// org.chromium.UserDataAuth D-Bus service before the ioctl is issued.
//
// Failures of either ioctl and an unsuccessful D-Bus reply are reported inside
// `IoctlReply::Done`. Failures to look up the target, read the argument, stat the file, clone
// the descriptor, parse the reply or complete the D-Bus call are returned as the outer error.
fn set_fsxattr<R: io::Read>(
    &self,
    #[cfg_attr(not(feature = "chromeos"), allow(unused_variables))] ctx: Context,
    inode: Inode,
    handle: Handle,
    r: R,
) -> io::Result<IoctlReply> {
    let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
        self.find_inode(inode)?
    } else {
        self.find_handle(handle, inode)?
    };

    // The attributes requested by the client.
    let in_attr = fsxattr::from_reader(r)?;

    // Only needed for the ownership check below.
    #[cfg(feature = "chromeos")]
    let st = stat(&*data)?;

    // Changing quota project ID requires CAP_FOWNER or being file owner.
    // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
    #[cfg(feature = "chromeos")]
    if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
        // Get the current fsxattr.
        let mut buf = MaybeUninit::<fsxattr>::zeroed();
        // Safe because the kernel will only write to `buf` and we check the return value.
        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
        if res < 0 {
            return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
        }
        // Safe because `buf` was zero-initialized above and the ioctl succeeded.
        let current_attr = unsafe { buf.assume_init() };

        // Project ID cannot be changed inside a user namespace.
        // Use UserDataAuth to avoid this restriction.
        if current_attr.fsx_projid != in_attr.fsx_projid {
            // NOTE(review): this unwrap panics if `dbus_connection` is `None`; presumably it is
            // always set when the "chromeos" feature is enabled -- confirm.
            let connection = self.dbus_connection.as_ref().unwrap().lock();
            let proxy = connection.with_proxy(
                "org.chromium.UserDataAuth",
                "/org/chromium/UserDataAuth",
                DEFAULT_DBUS_TIMEOUT,
            );
            let mut proto: SetMediaRWDataFileProjectIdRequest = Message::new();
            proto.project_id = in_attr.fsx_projid;
            // Safe because data is a valid file descriptor.
            let fd = unsafe { dbus::arg::OwnedFd::new(base::clone_descriptor(&*data)?) };
            match proxy.set_media_rwdata_file_project_id(fd, proto.write_to_bytes().unwrap()) {
                Ok(r) => {
                    let r = protobuf::parse_from_bytes::<SetMediaRWDataFileProjectIdReply>(&r)
                        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
                    // The service reports its own failure as an errno-style code.
                    if !r.success {
                        return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                            r.error,
                        ))));
                    }
                }
                Err(e) => {
                    return Err(io::Error::new(io::ErrorKind::Other, e));
                }
            };
        }
    }

    // Safe because this doesn't modify any memory and we check the return value.
    let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &in_attr) };
    if res < 0 {
        Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
    } else {
        Ok(IoctlReply::Done(Ok(Vec::new())))
    }
}
1065
get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1066 fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1067 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1068 self.find_inode(inode)?
1069 } else {
1070 self.find_handle(handle, inode)?
1071 };
1072
1073 // The ioctl encoding is a long but the parameter is actually an int.
1074 let mut flags: c_int = 0;
1075
1076 // Safe because the kernel will only write to `flags` and we check the return value.
1077 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut flags) };
1078 if res < 0 {
1079 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1080 } else {
1081 Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1082 }
1083 }
1084
set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply>1085 fn set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply> {
1086 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1087 self.find_inode(inode)?
1088 } else {
1089 self.find_handle(handle, inode)?
1090 };
1091
1092 // The ioctl encoding is a long but the parameter is actually an int.
1093 let flags = c_int::from_reader(r)?;
1094
1095 // Safe because this doesn't modify any memory and we check the return value.
1096 let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &flags) };
1097 if res < 0 {
1098 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1099 } else {
1100 Ok(IoctlReply::Done(Ok(Vec::new())))
1101 }
1102 }
1103
// Handles FS_IOC_ENABLE_VERITY for a regular file.
//
// Reads an `fsverity_enable_arg` from `r`, followed by `salt_size` bytes of salt and `sig_size`
// bytes of signature (each only when its size is non-zero), points the argument at local copies
// of those buffers and issues the ioctl on the inode (when `zero_message_open` is set) or on the
// handle (otherwise).
//
// Returns `EISDIR` for directories and `EINVAL` for other non-regular files as the outer error.
// A salt or signature larger than `self.max_buffer_size()` is reported as `ENOMEM` inside
// `IoctlReply::Done`, as is a failure of the ioctl itself.
//
// Panics if the access mode stored with the inode is not one of O_RDONLY, O_WRONLY or O_RDWR.
fn enable_verity<R: io::Read>(
    &self,
    inode: Inode,
    handle: Handle,
    mut r: R,
) -> io::Result<IoctlReply> {
    let inode_data = self.find_inode(inode)?;

    // These match the return codes from `fsverity_ioctl_enable` in the kernel.
    match inode_data.filetype {
        FileType::Regular => {}
        FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
        FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
    }

    {
        // We cannot enable verity while holding a writable fd so get a new one, if necessary.
        // This step runs regardless of `zero_message_open`. The lock is released at the end of
        // this scope.
        let mut file = inode_data.file.lock();
        let mut flags = file.1;
        match flags & libc::O_ACCMODE {
            libc::O_WRONLY | libc::O_RDWR => {
                flags &= !libc::O_ACCMODE;
                flags |= libc::O_RDONLY;

                // We need to get a read-only handle for this file.
                let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDONLY)?;
                *file = (newfile, flags);
            }
            libc::O_RDONLY => {}
            _ => panic!("Unexpected flags: {:#x}", flags),
        }
    }

    let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
        inode_data
    } else {
        let data = self.find_handle(handle, inode)?;

        {
            // We can't enable verity while holding a writable fd. We don't know whether the file
            // was opened for writing so check it here. We don't expect this to be a frequent
            // operation so the extra latency should be fine.
            let mut file = data.file.lock();
            let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
            match flags {
                FileFlags::ReadWrite | FileFlags::Write => {
                    // We need to get a read-only handle for this file.
                    *file = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
                }
                FileFlags::Read => {}
            }
        }

        data
    };

    let mut arg = fsverity_enable_arg::from_reader(&mut r)?;

    // `salt` is declared outside the `if` so that the buffer `arg.salt_ptr` points to stays
    // alive until after the ioctl below.
    let mut salt;
    if arg.salt_size > 0 {
        if arg.salt_size > self.max_buffer_size() {
            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                libc::ENOMEM,
            ))));
        }
        salt = vec![0; arg.salt_size as usize];
        r.read_exact(&mut salt)?;
        arg.salt_ptr = salt.as_ptr() as usize as u64;
    } else {
        arg.salt_ptr = 0;
    }

    // Same lifetime consideration as for `salt` above.
    let mut sig;
    if arg.sig_size > 0 {
        if arg.sig_size > self.max_buffer_size() {
            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                libc::ENOMEM,
            ))));
        }
        sig = vec![0; arg.sig_size as usize];
        r.read_exact(&mut sig)?;
        arg.sig_ptr = sig.as_ptr() as usize as u64;
    } else {
        arg.sig_ptr = 0;
    }

    // Safe because this doesn't modify any memory and we check the return value.
    let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY(), &arg) };
    if res < 0 {
        Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
    } else {
        Ok(IoctlReply::Done(Ok(Vec::new())))
    }
}
1198
// Handles FS_IOC_MEASURE_VERITY.
//
// Reads the caller's `fsverity_digest` header from `r`, asks the kernel for the digest using a
// locally sized buffer and replies with the header plus the digest bytes the kernel produced.
// `out_size` is the number of bytes the caller can accept.
//
// Replies with `EOVERFLOW` inside `IoctlReply::Done` when either the caller's `digest_size` is
// smaller than the digest size written by the kernel or `out_size` is smaller than the reply.
// An ioctl failure is also reported inside `IoctlReply::Done`; lookup and read failures are
// returned as the outer error.
fn measure_verity<R: io::Read>(
    &self,
    inode: Inode,
    handle: Handle,
    r: R,
    out_size: u32,
) -> io::Result<IoctlReply> {
    let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
        self.find_inode(inode)?
    } else {
        self.find_handle(handle, inode)?
    };

    // The header supplied by the caller; only its `digest_size` is consulted below.
    let digest = fsverity_digest::from_reader(r)?;

    // Taken from fs/verity/fsverity_private.h.
    const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;

    // This digest size is what the fsverity command line utility uses.
    const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
    const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
    // Number of `fsverity_digest` elements needed to cover `BUFLEN` bytes, rounded up.
    const ROUNDED_LEN: usize =
        (BUFLEN + size_of::<fsverity_digest>() - 1) / size_of::<fsverity_digest>();

    // Make sure we get a properly aligned allocation.
    let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];

    // Safe because we are only writing data and not reading uninitialized memory.
    unsafe {
        // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
        addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
            .write(DIGEST_SIZE)
    };

    // Safe because this will only modify `buf` and we check the return value.
    let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY(), buf.as_mut_ptr()) };
    if res < 0 {
        Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
    } else {
        // Safe because this value was initialized by us already and then overwritten by the
        // kernel.
        // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
        let digest_size =
            unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
        // Total reply length: the header followed by the digest bytes.
        let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);

        // The kernel guarantees this but it doesn't hurt to be paranoid.
        debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
        if digest.digest_size < digest_size || out_size < outlen {
            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                libc::EOVERFLOW,
            ))));
        }

        // Safe because any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
        // doesn't contain any references.
        let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
            unsafe { mem::transmute(buf) };

        // Casting to `*const [u8]` is safe because the kernel guarantees that the first
        // `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed to have
        // the same layout as `u8`.
        // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
        let buf =
            unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
        Ok(IoctlReply::Done(Ok(buf.to_vec())))
    }
}
1267 }
1268
forget_one( inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, inode: Inode, count: u64, )1269 fn forget_one(
1270 inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
1271 inode: Inode,
1272 count: u64,
1273 ) {
1274 if let Some(data) = inodes.get(&inode) {
1275 // Acquiring the write lock on the inode map prevents new lookups from incrementing the
1276 // refcount but there is the possibility that a previous lookup already acquired a
1277 // reference to the inode data and is in the process of updating the refcount so we need
1278 // to loop here until we can decrement successfully.
1279 loop {
1280 let refcount = data.refcount.load(Ordering::Relaxed);
1281
1282 // Saturating sub because it doesn't make sense for a refcount to go below zero and
1283 // we don't want misbehaving clients to cause integer overflow.
1284 let new_count = refcount.saturating_sub(count);
1285
1286 // Synchronizes with the acquire load in `do_lookup`.
1287 if data
1288 .refcount
1289 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
1290 .is_ok()
1291 {
1292 if new_count == 0 {
1293 // We just removed the last refcount for this inode. There's no need for an
1294 // acquire fence here because we hold a write lock on the inode map and any
1295 // thread that is waiting to do a forget on the same inode will have to wait
1296 // until we release the lock. So there's is no other release store for us to
1297 // synchronize with before deleting the entry.
1298 inodes.remove(&inode);
1299 }
1300 break;
1301 }
1302 }
1303 }
1304 }
1305
1306 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
1307 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
strip_xattr_prefix(buf: &mut Vec<u8>)1308 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
1309 fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
1310 if start >= b.len() {
1311 return None;
1312 }
1313
1314 let end = b[start..]
1315 .iter()
1316 .position(|&c| c == b'\0')
1317 .map(|p| start + p + 1)
1318 .unwrap_or(b.len());
1319
1320 Some(&b[start..end])
1321 }
1322
1323 let mut pos = 0;
1324 while let Some(name) = next_cstr(buf, pos) {
1325 if !name.starts_with(USER_VIRTIOFS_XATTR) {
1326 pos += name.len();
1327 continue;
1328 }
1329
1330 let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
1331 buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
1332 pos += newlen;
1333 }
1334 }
1335
1336 impl FileSystem for PassthroughFs {
1337 type Inode = Inode;
1338 type Handle = Handle;
1339 type DirIter = ReadDir<Box<[u8]>>;
1340
init(&self, capable: FsOptions) -> io::Result<FsOptions>1341 fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
1342 // Safe because this is a constant value and a valid C string.
1343 let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
1344
1345 let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1346 // Safe because this doesn't modify any memory and we check the return value.
1347 let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
1348 if raw_descriptor < 0 {
1349 return Err(io::Error::last_os_error());
1350 }
1351
1352 // Safe because we just opened this descriptor above.
1353 let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
1354
1355 let st = stat(&f)?;
1356
1357 // Safe because this doesn't modify any memory and there is no need to check the return
1358 // value because this system call always succeeds. We need to clear the umask here because
1359 // we want the client to be able to set all the bits in the mode.
1360 unsafe { libc::umask(0o000) };
1361
1362 let mut inodes = self.inodes.lock();
1363
1364 // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1365 inodes.insert(
1366 ROOT_ID,
1367 InodeAltKey {
1368 ino: st.st_ino,
1369 dev: st.st_dev,
1370 },
1371 Arc::new(InodeData {
1372 inode: ROOT_ID,
1373 file: Mutex::new((f, flags)),
1374 refcount: AtomicU64::new(2),
1375 filetype: st.st_mode.into(),
1376 }),
1377 );
1378
1379 let mut opts = FsOptions::DO_READDIRPLUS
1380 | FsOptions::READDIRPLUS_AUTO
1381 | FsOptions::EXPORT_SUPPORT
1382 | FsOptions::DONT_MASK;
1383 if self.cfg.posix_acl {
1384 opts |= FsOptions::POSIX_ACL;
1385 }
1386 if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
1387 opts |= FsOptions::WRITEBACK_CACHE;
1388 self.writeback.store(true, Ordering::Relaxed);
1389 }
1390 if self.cfg.cache_policy == CachePolicy::Always {
1391 if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
1392 opts |= FsOptions::ZERO_MESSAGE_OPEN;
1393 self.zero_message_open.store(true, Ordering::Relaxed);
1394 }
1395 if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
1396 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
1397 self.zero_message_opendir.store(true, Ordering::Relaxed);
1398 }
1399 }
1400 Ok(opts)
1401 }
1402
destroy(&self)1403 fn destroy(&self) {
1404 self.handles.lock().clear();
1405 self.inodes.lock().clear();
1406 }
1407
statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64>1408 fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
1409 let data = self.find_inode(inode)?;
1410
1411 let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1412
1413 // Safe because this will only modify `out` and we check the return value.
1414 syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
1415
1416 // Safe because the kernel guarantees that `out` has been initialized.
1417 Ok(unsafe { out.assume_init() })
1418 }
1419
lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry>1420 fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
1421 let data = self.find_inode(parent)?;
1422 self.do_lookup(&data, name).or_else(|e| {
1423 if self.cfg.ascii_casefold {
1424 self.ascii_casefold_lookup(&data, name.to_bytes())
1425 } else {
1426 Err(e)
1427 }
1428 })
1429 }
1430
forget(&self, _ctx: Context, inode: Inode, count: u64)1431 fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
1432 let mut inodes = self.inodes.lock();
1433
1434 forget_one(&mut inodes, inode, count)
1435 }
1436
batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>)1437 fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
1438 let mut inodes = self.inodes.lock();
1439
1440 for (inode, count) in requests {
1441 forget_one(&mut inodes, inode, count)
1442 }
1443 }
1444
opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1445 fn opendir(
1446 &self,
1447 _ctx: Context,
1448 inode: Inode,
1449 flags: u32,
1450 ) -> io::Result<(Option<Handle>, OpenOptions)> {
1451 if self.zero_message_opendir.load(Ordering::Relaxed) {
1452 Err(io::Error::from_raw_os_error(libc::ENOSYS))
1453 } else {
1454 self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
1455 }
1456 }
1457
releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()>1458 fn releasedir(
1459 &self,
1460 _ctx: Context,
1461 inode: Inode,
1462 _flags: u32,
1463 handle: Handle,
1464 ) -> io::Result<()> {
1465 if self.zero_message_opendir.load(Ordering::Relaxed) {
1466 Ok(())
1467 } else {
1468 self.do_release(inode, handle)
1469 }
1470 }
1471
mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, ) -> io::Result<Entry>1472 fn mkdir(
1473 &self,
1474 ctx: Context,
1475 parent: Inode,
1476 name: &CStr,
1477 mode: u32,
1478 umask: u32,
1479 ) -> io::Result<Entry> {
1480 let data = self.find_inode(parent)?;
1481
1482 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1483 {
1484 let _scoped_umask = ScopedUmask::new(umask);
1485
1486 // Safe because this doesn't modify any memory and we check the return value.
1487 syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
1488 }
1489
1490 self.do_lookup(&data, name)
1491 }
1492
rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1493 fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1494 let data = self.find_inode(parent)?;
1495 self.do_unlink(&data, name, libc::AT_REMOVEDIR)
1496 }
1497
readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result<Self::DirIter>1498 fn readdir(
1499 &self,
1500 _ctx: Context,
1501 inode: Inode,
1502 handle: Handle,
1503 size: u32,
1504 offset: u64,
1505 ) -> io::Result<Self::DirIter> {
1506 let buf = vec![0; size as usize].into_boxed_slice();
1507
1508 if self.zero_message_opendir.load(Ordering::Relaxed) {
1509 let data = self.find_inode(inode)?;
1510 ReadDir::new(&*data, offset as libc::off64_t, buf)
1511 } else {
1512 let data = self.find_handle(handle, inode)?;
1513
1514 let dir = data.file.lock();
1515
1516 ReadDir::new(&*dir, offset as libc::off64_t, buf)
1517 }
1518 }
1519
open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1520 fn open(
1521 &self,
1522 _ctx: Context,
1523 inode: Inode,
1524 flags: u32,
1525 ) -> io::Result<(Option<Handle>, OpenOptions)> {
1526 if self.zero_message_open.load(Ordering::Relaxed) {
1527 Err(io::Error::from_raw_os_error(libc::ENOSYS))
1528 } else {
1529 self.do_open(inode, flags)
1530 }
1531 }
1532
release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option<u64>, ) -> io::Result<()>1533 fn release(
1534 &self,
1535 _ctx: Context,
1536 inode: Inode,
1537 _flags: u32,
1538 handle: Handle,
1539 _flush: bool,
1540 _flock_release: bool,
1541 _lock_owner: Option<u64>,
1542 ) -> io::Result<()> {
1543 if self.zero_message_open.load(Ordering::Relaxed) {
1544 Ok(())
1545 } else {
1546 self.do_release(inode, handle)
1547 }
1548 }
1549
chromeos_tmpfile( &self, ctx: Context, parent: Self::Inode, mode: u32, umask: u32, ) -> io::Result<Entry>1550 fn chromeos_tmpfile(
1551 &self,
1552 ctx: Context,
1553 parent: Self::Inode,
1554 mode: u32,
1555 umask: u32,
1556 ) -> io::Result<Entry> {
1557 let data = self.find_inode(parent)?;
1558
1559 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1560
1561 let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
1562
1563 // Safe because this is a valid c string.
1564 let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
1565
1566 let fd = {
1567 let _scoped_umask = ScopedUmask::new(umask);
1568
1569 // Safe because this doesn't modify any memory and we check the return value.
1570 syscall!(unsafe {
1571 libc::openat64(
1572 data.as_raw_descriptor(),
1573 current_dir.as_ptr(),
1574 tmpflags,
1575 mode,
1576 )
1577 })?
1578 };
1579
1580 // Safe because we just opened this fd.
1581 let tmpfile = unsafe { File::from_raw_descriptor(fd) };
1582
1583 let st = stat(&tmpfile)?;
1584 Ok(self.add_entry(tmpfile, st, tmpflags))
1585 }
1586
create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>1587 fn create(
1588 &self,
1589 ctx: Context,
1590 parent: Inode,
1591 name: &CStr,
1592 mode: u32,
1593 flags: u32,
1594 umask: u32,
1595 ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
1596 let data = self.find_inode(parent)?;
1597
1598 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1599
1600 let create_flags =
1601 (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
1602
1603 let fd = {
1604 let _scoped_umask = ScopedUmask::new(umask);
1605
1606 // Safe because this doesn't modify any memory and we check the return value. We don't
1607 // really check `flags` because if the kernel can't handle poorly specified flags then
1608 // we have much bigger problems.
1609 syscall!(unsafe {
1610 libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
1611 })?
1612 };
1613
1614 // Safe because we just opened this fd.
1615 let file = unsafe { File::from_raw_descriptor(fd) };
1616
1617 let st = stat(&file)?;
1618 let entry = self.add_entry(file, st, create_flags);
1619
1620 let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
1621 (None, OpenOptions::KEEP_CACHE)
1622 } else {
1623 self.do_open(
1624 entry.inode,
1625 flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
1626 )
1627 .map_err(|e| {
1628 // Don't leak the entry.
1629 self.forget(ctx, entry.inode, 1);
1630 e
1631 })?
1632 };
1633
1634 Ok((entry, handle, opts))
1635 }
1636
unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1637 fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1638 let data = self.find_inode(parent)?;
1639 self.do_unlink(&data, name, 0)
1640 }
1641
read<W: io::Write + ZeroCopyWriter>( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option<u64>, _flags: u32, ) -> io::Result<usize>1642 fn read<W: io::Write + ZeroCopyWriter>(
1643 &self,
1644 _ctx: Context,
1645 inode: Inode,
1646 handle: Handle,
1647 mut w: W,
1648 size: u32,
1649 offset: u64,
1650 _lock_owner: Option<u64>,
1651 _flags: u32,
1652 ) -> io::Result<usize> {
1653 if self.zero_message_open.load(Ordering::Relaxed) {
1654 let data = self.find_inode(inode)?;
1655
1656 let mut file = data.file.lock();
1657 let mut flags = file.1;
1658 match flags & libc::O_ACCMODE {
1659 libc::O_WRONLY => {
1660 flags &= !libc::O_WRONLY;
1661 flags |= libc::O_RDWR;
1662
1663 // We need to get a readable handle for this file.
1664 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1665 *file = (newfile, flags);
1666 }
1667 libc::O_RDONLY | libc::O_RDWR => {}
1668 _ => panic!("Unexpected flags: {:#x}", flags),
1669 }
1670
1671 w.write_from(&mut file.0, size as usize, offset)
1672 } else {
1673 let data = self.find_handle(handle, inode)?;
1674
1675 let mut f = data.file.lock();
1676 w.write_from(&mut f, size as usize, offset)
1677 }
1678 }
1679
write<R: io::Read + ZeroCopyReader>( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option<u64>, _delayed_write: bool, flags: u32, ) -> io::Result<usize>1680 fn write<R: io::Read + ZeroCopyReader>(
1681 &self,
1682 _ctx: Context,
1683 inode: Inode,
1684 handle: Handle,
1685 mut r: R,
1686 size: u32,
1687 offset: u64,
1688 _lock_owner: Option<u64>,
1689 _delayed_write: bool,
1690 flags: u32,
1691 ) -> io::Result<usize> {
1692 // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
1693 // automatically clear the setuid and setgid bits for us.
1694 let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
1695 Some(drop_cap_fsetid()?)
1696 } else {
1697 None
1698 };
1699
1700 if self.zero_message_open.load(Ordering::Relaxed) {
1701 let data = self.find_inode(inode)?;
1702
1703 let mut file = data.file.lock();
1704 let mut flags = file.1;
1705 match flags & libc::O_ACCMODE {
1706 libc::O_RDONLY => {
1707 flags &= !libc::O_RDONLY;
1708 flags |= libc::O_RDWR;
1709
1710 // We need to get a writable handle for this file.
1711 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1712 *file = (newfile, flags);
1713 }
1714 libc::O_WRONLY | libc::O_RDWR => {}
1715 _ => panic!("Unexpected flags: {:#x}", flags),
1716 }
1717
1718 r.read_to(&mut file.0, size as usize, offset)
1719 } else {
1720 let data = self.find_handle(handle, inode)?;
1721
1722 let mut f = data.file.lock();
1723 r.read_to(&mut f, size as usize, offset)
1724 }
1725 }
1726
getattr( &self, _ctx: Context, inode: Inode, _handle: Option<Handle>, ) -> io::Result<(libc::stat64, Duration)>1727 fn getattr(
1728 &self,
1729 _ctx: Context,
1730 inode: Inode,
1731 _handle: Option<Handle>,
1732 ) -> io::Result<(libc::stat64, Duration)> {
1733 let data = self.find_inode(inode)?;
1734 self.do_getattr(&data)
1735 }
1736
setattr( &self, _ctx: Context, inode: Inode, attr: libc::stat64, handle: Option<Handle>, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)>1737 fn setattr(
1738 &self,
1739 _ctx: Context,
1740 inode: Inode,
1741 attr: libc::stat64,
1742 handle: Option<Handle>,
1743 valid: SetattrValid,
1744 ) -> io::Result<(libc::stat64, Duration)> {
1745 let inode_data = self.find_inode(inode)?;
1746
1747 enum Data {
1748 Handle(Arc<HandleData>, RawDescriptor),
1749 ProcPath(CString),
1750 }
1751
1752 // If we have a handle then use it otherwise get a new fd from the inode.
1753 let data = if let Some(handle) = handle.filter(|&h| h != 0) {
1754 let hd = self.find_handle(handle, inode)?;
1755
1756 let fd = hd.file.lock().as_raw_descriptor();
1757 Data::Handle(hd, fd)
1758 } else {
1759 let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
1760 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1761 Data::ProcPath(pathname)
1762 };
1763
1764 if valid.contains(SetattrValid::MODE) {
1765 // Safe because this doesn't modify any memory and we check the return value.
1766 syscall!(unsafe {
1767 match data {
1768 Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
1769 Data::ProcPath(ref p) => {
1770 libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
1771 }
1772 }
1773 })?;
1774 }
1775
1776 if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
1777 let uid = if valid.contains(SetattrValid::UID) {
1778 attr.st_uid
1779 } else {
1780 // Cannot use -1 here because these are unsigned values.
1781 ::std::u32::MAX
1782 };
1783 let gid = if valid.contains(SetattrValid::GID) {
1784 attr.st_gid
1785 } else {
1786 // Cannot use -1 here because these are unsigned values.
1787 ::std::u32::MAX
1788 };
1789
1790 // Safe because this is a constant value and a valid C string.
1791 let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1792
1793 // Safe because this doesn't modify any memory and we check the return value.
1794 syscall!(unsafe {
1795 libc::fchownat(
1796 inode_data.as_raw_descriptor(),
1797 empty.as_ptr(),
1798 uid,
1799 gid,
1800 libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
1801 )
1802 })?;
1803 }
1804
1805 if valid.contains(SetattrValid::SIZE) {
1806 // Safe because this doesn't modify any memory and we check the return value.
1807 syscall!(match data {
1808 Data::Handle(_, fd) => unsafe { libc::ftruncate64(fd, attr.st_size) },
1809 _ => {
1810 // There is no `ftruncateat` so we need to get a new fd and truncate it.
1811 let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
1812 unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
1813 }
1814 })?;
1815 }
1816
1817 if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
1818 let mut tvs = [
1819 libc::timespec {
1820 tv_sec: 0,
1821 tv_nsec: libc::UTIME_OMIT,
1822 },
1823 libc::timespec {
1824 tv_sec: 0,
1825 tv_nsec: libc::UTIME_OMIT,
1826 },
1827 ];
1828
1829 if valid.contains(SetattrValid::ATIME_NOW) {
1830 tvs[0].tv_nsec = libc::UTIME_NOW;
1831 } else if valid.contains(SetattrValid::ATIME) {
1832 tvs[0].tv_sec = attr.st_atime;
1833 tvs[0].tv_nsec = attr.st_atime_nsec;
1834 }
1835
1836 if valid.contains(SetattrValid::MTIME_NOW) {
1837 tvs[1].tv_nsec = libc::UTIME_NOW;
1838 } else if valid.contains(SetattrValid::MTIME) {
1839 tvs[1].tv_sec = attr.st_mtime;
1840 tvs[1].tv_nsec = attr.st_mtime_nsec;
1841 }
1842
1843 // Safe because this doesn't modify any memory and we check the return value.
1844 syscall!(unsafe {
1845 match data {
1846 Data::Handle(_, fd) => libc::futimens(fd, tvs.as_ptr()),
1847 Data::ProcPath(ref p) => {
1848 libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
1849 }
1850 }
1851 })?;
1852 }
1853
1854 self.do_getattr(&inode_data)
1855 }
1856
rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()>1857 fn rename(
1858 &self,
1859 _ctx: Context,
1860 olddir: Inode,
1861 oldname: &CStr,
1862 newdir: Inode,
1863 newname: &CStr,
1864 flags: u32,
1865 ) -> io::Result<()> {
1866 let old_inode = self.find_inode(olddir)?;
1867 let new_inode = self.find_inode(newdir)?;
1868
1869 // Safe because this doesn't modify any memory and we check the return value.
1870 // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
1871 // and we have glibc 2.28.
1872 syscall!(unsafe {
1873 libc::syscall(
1874 libc::SYS_renameat2,
1875 old_inode.as_raw_descriptor(),
1876 oldname.as_ptr(),
1877 new_inode.as_raw_descriptor(),
1878 newname.as_ptr(),
1879 flags,
1880 )
1881 })?;
1882 Ok(())
1883 }
1884
mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, ) -> io::Result<Entry>1885 fn mknod(
1886 &self,
1887 ctx: Context,
1888 parent: Inode,
1889 name: &CStr,
1890 mode: u32,
1891 rdev: u32,
1892 umask: u32,
1893 ) -> io::Result<Entry> {
1894 let data = self.find_inode(parent)?;
1895
1896 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1897
1898 {
1899 let _scoped_umask = ScopedUmask::new(umask);
1900
1901 // Safe because this doesn't modify any memory and we check the return value.
1902 syscall!(unsafe {
1903 libc::mknodat(
1904 data.as_raw_descriptor(),
1905 name.as_ptr(),
1906 mode as libc::mode_t,
1907 rdev as libc::dev_t,
1908 )
1909 })?;
1910 }
1911
1912 self.do_lookup(&data, name)
1913 }
1914
link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result<Entry>1915 fn link(
1916 &self,
1917 _ctx: Context,
1918 inode: Inode,
1919 newparent: Inode,
1920 newname: &CStr,
1921 ) -> io::Result<Entry> {
1922 let data = self.find_inode(inode)?;
1923 let new_inode = self.find_inode(newparent)?;
1924
1925 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1926 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1927
1928 // Safe because this doesn't modify any memory and we check the return value.
1929 syscall!(unsafe {
1930 libc::linkat(
1931 self.proc.as_raw_descriptor(),
1932 path.as_ptr(),
1933 new_inode.as_raw_descriptor(),
1934 newname.as_ptr(),
1935 libc::AT_SYMLINK_FOLLOW,
1936 )
1937 })?;
1938
1939 self.do_lookup(&new_inode, newname)
1940 }
1941
symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, ) -> io::Result<Entry>1942 fn symlink(
1943 &self,
1944 ctx: Context,
1945 linkname: &CStr,
1946 parent: Inode,
1947 name: &CStr,
1948 ) -> io::Result<Entry> {
1949 let data = self.find_inode(parent)?;
1950
1951 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1952
1953 // Safe because this doesn't modify any memory and we check the return value.
1954 syscall!(unsafe {
1955 libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
1956 })?;
1957
1958 self.do_lookup(&data, name)
1959 }
1960
readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>>1961 fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
1962 let data = self.find_inode(inode)?;
1963
1964 let mut buf = vec![0; libc::PATH_MAX as usize];
1965
1966 // Safe because this is a constant value and a valid C string.
1967 let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1968
1969 // Safe because this will only modify the contents of `buf` and we check the return value.
1970 let res = syscall!(unsafe {
1971 libc::readlinkat(
1972 data.as_raw_descriptor(),
1973 empty.as_ptr(),
1974 buf.as_mut_ptr() as *mut libc::c_char,
1975 buf.len(),
1976 )
1977 })?;
1978
1979 buf.resize(res as usize, 0);
1980 Ok(buf)
1981 }
1982
flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()>1983 fn flush(
1984 &self,
1985 _ctx: Context,
1986 inode: Inode,
1987 handle: Handle,
1988 _lock_owner: u64,
1989 ) -> io::Result<()> {
1990 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1991 self.find_inode(inode)?
1992 } else {
1993 self.find_handle(handle, inode)?
1994 };
1995
1996 // Since this method is called whenever an fd is closed in the client, we can emulate that
1997 // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
1998 // because this doesn't modify any memory and we check the return values.
1999 unsafe {
2000 let newfd = syscall!(libc::fcntl(
2001 data.as_raw_descriptor(),
2002 libc::F_DUPFD_CLOEXEC,
2003 0
2004 ))?;
2005
2006 syscall!(libc::close(newfd))?;
2007 }
2008 Ok(())
2009 }
2010
fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()>2011 fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
2012 if self.zero_message_open.load(Ordering::Relaxed) {
2013 let data = self.find_inode(inode)?;
2014 self.do_fsync(&*data, datasync)
2015 } else {
2016 let data = self.find_handle(handle, inode)?;
2017
2018 let file = data.file.lock();
2019 self.do_fsync(&*file, datasync)
2020 }
2021 }
2022
fsyncdir( &self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()>2023 fn fsyncdir(
2024 &self,
2025 _ctx: Context,
2026 inode: Inode,
2027 datasync: bool,
2028 handle: Handle,
2029 ) -> io::Result<()> {
2030 if self.zero_message_opendir.load(Ordering::Relaxed) {
2031 let data = self.find_inode(inode)?;
2032 self.do_fsync(&*data, datasync)
2033 } else {
2034 let data = self.find_handle(handle, inode)?;
2035
2036 let file = data.file.lock();
2037 self.do_fsync(&*file, datasync)
2038 }
2039 }
2040
access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()>2041 fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
2042 let data = self.find_inode(inode)?;
2043
2044 let st = stat(&*data)?;
2045 let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
2046
2047 if mode == libc::F_OK {
2048 // The file exists since we were able to call `stat(2)` on it.
2049 return Ok(());
2050 }
2051
2052 if (mode & libc::R_OK) != 0 {
2053 if ctx.uid != 0
2054 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
2055 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
2056 && st.st_mode & 0o004 == 0
2057 {
2058 return Err(io::Error::from_raw_os_error(libc::EACCES));
2059 }
2060 }
2061
2062 if (mode & libc::W_OK) != 0 {
2063 if ctx.uid != 0
2064 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
2065 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
2066 && st.st_mode & 0o002 == 0
2067 {
2068 return Err(io::Error::from_raw_os_error(libc::EACCES));
2069 }
2070 }
2071
2072 // root can only execute something if it is executable by one of the owner, the group, or
2073 // everyone.
2074 if (mode & libc::X_OK) != 0 {
2075 if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
2076 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
2077 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
2078 && st.st_mode & 0o001 == 0
2079 {
2080 return Err(io::Error::from_raw_os_error(libc::EACCES));
2081 }
2082 }
2083
2084 Ok(())
2085 }
2086
setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()>2087 fn setxattr(
2088 &self,
2089 _ctx: Context,
2090 inode: Inode,
2091 name: &CStr,
2092 value: &[u8],
2093 flags: u32,
2094 ) -> io::Result<()> {
2095 // We can't allow the VM to set this xattr because an unprivileged process may use it to set
2096 // a privileged xattr.
2097 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2098 return Err(io::Error::from_raw_os_error(libc::EPERM));
2099 }
2100
2101 let data = self.find_inode(inode)?;
2102 let name = self.rewrite_xattr_name(name);
2103
2104 if data.filetype == FileType::Other {
2105 // For non-regular files and directories, we cannot open the fd normally. Instead we
2106 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2107 // and then setting the CWD back to the root directory.
2108 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2109 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2110
2111 // Safe because this doesn't modify any memory and we check the return value.
2112 syscall!(self.with_proc_chdir(|| {
2113 unsafe {
2114 libc::setxattr(
2115 path.as_ptr(),
2116 name.as_ptr(),
2117 value.as_ptr() as *const libc::c_void,
2118 value.len() as libc::size_t,
2119 flags as c_int,
2120 )
2121 }
2122 }))?;
2123 } else {
2124 // For regular files and directories, we can just use fsetxattr. Safe because this
2125 // doesn't modify any memory and we check the return value.
2126 syscall!(unsafe {
2127 libc::fsetxattr(
2128 data.as_raw_descriptor(),
2129 name.as_ptr(),
2130 value.as_ptr() as *const libc::c_void,
2131 value.len() as libc::size_t,
2132 flags as c_int,
2133 )
2134 })?;
2135 }
2136
2137 Ok(())
2138 }
2139
getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result<GetxattrReply>2140 fn getxattr(
2141 &self,
2142 _ctx: Context,
2143 inode: Inode,
2144 name: &CStr,
2145 size: u32,
2146 ) -> io::Result<GetxattrReply> {
2147 // We don't allow the VM to set this xattr so we also pretend there is no value associated
2148 // with it.
2149 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2150 return Err(io::Error::from_raw_os_error(libc::ENODATA));
2151 }
2152
2153 let data = self.find_inode(inode)?;
2154 let name = self.rewrite_xattr_name(name);
2155 let mut buf = vec![0u8; size as usize];
2156
2157 // Safe because this will only modify the contents of `buf`.
2158 let res = self.do_getxattr(&data, &name, &mut buf[..])?;
2159 if size == 0 {
2160 Ok(GetxattrReply::Count(res as u32))
2161 } else {
2162 buf.truncate(res as usize);
2163 Ok(GetxattrReply::Value(buf))
2164 }
2165 }
2166
listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply>2167 fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
2168 let data = self.find_inode(inode)?;
2169
2170 let mut buf = vec![0u8; size as usize];
2171
2172 let res = if data.filetype == FileType::Other {
2173 // For non-regular files and directories, we cannot open the fd normally. Instead we
2174 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2175 // and then setting the CWD back to the root directory.
2176 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2177 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2178
2179 // Safe because this will only modify `buf` and we check the return value.
2180 syscall!(self.with_proc_chdir(|| unsafe {
2181 libc::listxattr(
2182 path.as_ptr(),
2183 buf.as_mut_ptr() as *mut libc::c_char,
2184 buf.len() as libc::size_t,
2185 )
2186 }))?
2187 } else {
2188 // For regular files and directories, we can just flistxattr. Safe because this will only
2189 // write to `buf` and we check the return value.
2190 syscall!(unsafe {
2191 libc::flistxattr(
2192 data.as_raw_descriptor(),
2193 buf.as_mut_ptr() as *mut libc::c_char,
2194 buf.len() as libc::size_t,
2195 )
2196 })?
2197 };
2198
2199 if size == 0 {
2200 Ok(ListxattrReply::Count(res as u32))
2201 } else {
2202 buf.truncate(res as usize);
2203
2204 if self.cfg.rewrite_security_xattrs {
2205 strip_xattr_prefix(&mut buf);
2206 }
2207 Ok(ListxattrReply::Names(buf))
2208 }
2209 }
2210
removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()>2211 fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
2212 // We don't allow the VM to set this xattr so we also pretend there is no value associated
2213 // with it.
2214 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2215 return Err(io::Error::from_raw_os_error(libc::ENODATA));
2216 }
2217
2218 let data = self.find_inode(inode)?;
2219 let name = self.rewrite_xattr_name(name);
2220
2221 if data.filetype == FileType::Other {
2222 // For non-regular files and directories, we cannot open the fd normally. Instead we
2223 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2224 // and then setting the CWD back to the root directory.
2225 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2226 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2227
2228 // Safe because this doesn't modify any memory and we check the return value.
2229 syscall!(
2230 self.with_proc_chdir(|| unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) })
2231 )?;
2232 } else {
2233 // For regular files and directories, we can just use fremovexattr. Safe because this
2234 // doesn't modify any memory and we check the return value.
2235 syscall!(unsafe { libc::fremovexattr(data.as_raw_descriptor(), name.as_ptr()) })?;
2236 }
2237
2238 Ok(())
2239 }
2240
fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()>2241 fn fallocate(
2242 &self,
2243 _ctx: Context,
2244 inode: Inode,
2245 handle: Handle,
2246 mode: u32,
2247 offset: u64,
2248 length: u64,
2249 ) -> io::Result<()> {
2250 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2251 let data = self.find_inode(inode)?;
2252
2253 {
2254 // fallocate needs a writable fd
2255 let mut file = data.file.lock();
2256 let mut flags = file.1;
2257 match flags & libc::O_ACCMODE {
2258 libc::O_RDONLY => {
2259 flags &= !libc::O_RDONLY;
2260 flags |= libc::O_RDWR;
2261
2262 // We need to get a writable handle for this file.
2263 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2264 *file = (newfile, flags);
2265 }
2266 libc::O_WRONLY | libc::O_RDWR => {}
2267 _ => panic!("Unexpected flags: {:#x}", flags),
2268 }
2269 }
2270
2271 data
2272 } else {
2273 self.find_handle(handle, inode)?
2274 };
2275
2276 let fd = data.as_raw_descriptor();
2277 // Safe because this doesn't modify any memory and we check the return value.
2278 syscall!(unsafe {
2279 libc::fallocate64(
2280 fd,
2281 mode as libc::c_int,
2282 offset as libc::off64_t,
2283 length as libc::off64_t,
2284 )
2285 })?;
2286
2287 Ok(())
2288 }
2289
ioctl<R: io::Read>( &self, ctx: Context, inode: Inode, handle: Handle, _flags: IoctlFlags, cmd: u32, _arg: u64, in_size: u32, out_size: u32, r: R, ) -> io::Result<IoctlReply>2290 fn ioctl<R: io::Read>(
2291 &self,
2292 ctx: Context,
2293 inode: Inode,
2294 handle: Handle,
2295 _flags: IoctlFlags,
2296 cmd: u32,
2297 _arg: u64,
2298 in_size: u32,
2299 out_size: u32,
2300 r: R,
2301 ) -> io::Result<IoctlReply> {
2302 const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
2303 const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
2304 const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
2305 const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
2306 const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
2307 const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
2308 const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
2309 const ENABLE_VERITY: u32 = FS_IOC_ENABLE_VERITY() as u32;
2310 const MEASURE_VERITY: u32 = FS_IOC_MEASURE_VERITY() as u32;
2311
2312 match cmd {
2313 GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
2314 GET_FSXATTR => {
2315 if out_size < size_of::<fsxattr>() as u32 {
2316 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2317 } else {
2318 self.get_fsxattr(inode, handle)
2319 }
2320 }
2321 SET_FSXATTR => {
2322 if in_size < size_of::<fsxattr>() as u32 {
2323 Err(io::Error::from_raw_os_error(libc::EINVAL))
2324 } else {
2325 self.set_fsxattr(ctx, inode, handle, r)
2326 }
2327 }
2328 GET_FLAGS32 | GET_FLAGS64 => {
2329 if out_size < size_of::<c_int>() as u32 {
2330 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2331 } else {
2332 self.get_flags(inode, handle)
2333 }
2334 }
2335 SET_FLAGS32 | SET_FLAGS64 => {
2336 if in_size < size_of::<c_int>() as u32 {
2337 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2338 } else {
2339 self.set_flags(inode, handle, r)
2340 }
2341 }
2342 ENABLE_VERITY => {
2343 if in_size < size_of::<fsverity_enable_arg>() as u32 {
2344 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2345 } else {
2346 self.enable_verity(inode, handle, r)
2347 }
2348 }
2349 MEASURE_VERITY => {
2350 if in_size < size_of::<fsverity_digest>() as u32
2351 || out_size < size_of::<fsverity_digest>() as u32
2352 {
2353 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2354 } else {
2355 self.measure_verity(inode, handle, r, out_size)
2356 }
2357 }
2358 _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
2359 }
2360 }
2361
copy_file_range( &self, ctx: Context, inode_src: Inode, handle_src: Handle, offset_src: u64, inode_dst: Inode, handle_dst: Handle, offset_dst: u64, length: u64, flags: u64, ) -> io::Result<usize>2362 fn copy_file_range(
2363 &self,
2364 ctx: Context,
2365 inode_src: Inode,
2366 handle_src: Handle,
2367 offset_src: u64,
2368 inode_dst: Inode,
2369 handle_dst: Handle,
2370 offset_dst: u64,
2371 length: u64,
2372 flags: u64,
2373 ) -> io::Result<usize> {
2374 // We need to change credentials during a write so that the kernel will remove setuid or
2375 // setgid bits from the file if it was written to by someone other than the owner.
2376 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2377 let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
2378 if self.zero_message_open.load(Ordering::Relaxed) {
2379 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
2380 } else {
2381 (
2382 self.find_handle(handle_src, inode_src)?,
2383 self.find_handle(handle_dst, inode_dst)?,
2384 )
2385 };
2386
2387 let src = src_data.as_raw_descriptor();
2388 let dst = dst_data.as_raw_descriptor();
2389
2390 Ok(syscall!(unsafe {
2391 libc::syscall(
2392 libc::SYS_copy_file_range,
2393 src,
2394 &offset_src,
2395 dst,
2396 &offset_dst,
2397 length,
2398 flags,
2399 )
2400 })? as usize)
2401 }
2402
set_up_mapping<M: Mapper>( &self, _ctx: Context, inode: Self::Inode, _handle: Self::Handle, file_offset: u64, mem_offset: u64, size: usize, prot: u32, mapper: M, ) -> io::Result<()>2403 fn set_up_mapping<M: Mapper>(
2404 &self,
2405 _ctx: Context,
2406 inode: Self::Inode,
2407 _handle: Self::Handle,
2408 file_offset: u64,
2409 mem_offset: u64,
2410 size: usize,
2411 prot: u32,
2412 mapper: M,
2413 ) -> io::Result<()> {
2414 if !self.cfg.use_dax {
2415 return Err(io::Error::from_raw_os_error(libc::ENOSYS));
2416 }
2417
2418 let read = prot & libc::PROT_READ as u32 != 0;
2419 let write = prot & libc::PROT_WRITE as u32 != 0;
2420 let mmap_flags = match (read, write) {
2421 (true, true) => libc::O_RDWR,
2422 (true, false) => libc::O_RDONLY,
2423 (false, true) => libc::O_RDWR, // mmap always requires an fd opened for reading.
2424 (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
2425 };
2426
2427 let data = self.find_inode(inode)?;
2428
2429 if self.zero_message_open.load(Ordering::Relaxed) {
2430 let mut file = data.file.lock();
2431 let mut open_flags = file.1;
2432 match (mmap_flags, open_flags & libc::O_ACCMODE) {
2433 (libc::O_RDONLY, libc::O_WRONLY)
2434 | (libc::O_RDWR, libc::O_RDONLY)
2435 | (libc::O_RDWR, libc::O_WRONLY) => {
2436 // We have a read-only or write-only fd and we need to upgrade it.
2437 open_flags &= !libc::O_ACCMODE;
2438 open_flags |= libc::O_RDWR;
2439
2440 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2441 *file = (newfile, open_flags);
2442 }
2443 (libc::O_RDONLY, libc::O_RDONLY)
2444 | (libc::O_RDONLY, libc::O_RDWR)
2445 | (libc::O_RDWR, libc::O_RDWR) => {}
2446 (m, o) => panic!(
2447 "Unexpected combination of access flags: ({:#x}, {:#x})",
2448 m, o
2449 ),
2450 }
2451 mapper.map(mem_offset, size, &file.0, file_offset, prot)
2452 } else {
2453 let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
2454 mapper.map(mem_offset, size, &file, file_offset, prot)
2455 }
2456 }
2457
remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()>2458 fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
2459 if !self.cfg.use_dax {
2460 return Err(io::Error::from_raw_os_error(libc::ENOSYS));
2461 }
2462
2463 for RemoveMappingOne { moffset, len } in msgs {
2464 mapper.unmap(*moffset, *len)?;
2465 }
2466 Ok(())
2467 }
2468 }
2469
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `&CStr` from a NUL-terminated byte literal using the checked constructor, so
    /// the tests need no `unsafe`.
    fn cstr(bytes: &[u8]) -> &CStr {
        CStr::from_bytes_with_nul(bytes).expect("test literal must end with a single NUL")
    }

    /// Runs `strip_xattr_prefix` on a copy of `input` and checks that the result is `expected`.
    fn assert_stripped(input: &[u8], expected: &[u8]) {
        let mut actual = input.to_vec();
        strip_xattr_prefix(&mut actual);
        assert_eq!(&actual[..], expected);
    }

    #[test]
    fn rewrite_xattr_names() {
        let cfg = Config {
            rewrite_security_xattrs: true,
            ..Default::default()
        };

        let p = PassthroughFs::new(cfg).expect("Failed to create PassthroughFs");

        // Selinux shouldn't get overwritten; user, trusted, and system should not be changed
        // either.
        let unchanged: [&[u8]; 4] = [
            b"security.selinux\0",
            b"user.foobar\0",
            b"trusted.foobar\0",
            b"system.foobar\0",
        ];
        for &raw in unchanged.iter() {
            let name = cstr(raw);
            assert_eq!(p.rewrite_xattr_name(name).to_bytes(), name.to_bytes());
        }

        // sehash should be re-written.
        let sehash = cstr(b"security.sehash\0");
        assert_eq!(
            p.rewrite_xattr_name(sehash).to_bytes(),
            b"user.virtiofs.security.sehash"
        );
    }

    #[test]
    fn strip_xattr_names() {
        // Inputs that must come back unchanged.
        let only_nuls = b"\0\0\0\0\0";
        assert_stripped(only_nuls, only_nuls);

        let no_nuls = b"security.sehashuser.virtiofs";
        assert_stripped(no_nuls, no_nuls);

        let empty = b"";
        assert_stripped(empty, empty);

        let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
        assert_stripped(no_strippable_names, no_strippable_names);

        // Inputs where the `user.virtiofs.` prefix must be removed.
        let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
        assert_stripped(only_strippable_names, b"security.sehash\0security.wat\0");

        let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
        assert_stripped(
            mixed_names,
            b"security.sehash\0security.selinux\0security.wat\0user.foobar\0",
        );

        let no_nul_with_prefix = b"user.virtiofs.security.sehash";
        assert_stripped(no_nul_with_prefix, b"security.sehash");
    }
}
2542