• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::{
6     borrow::Cow,
7     cmp,
8     collections::{btree_map, BTreeMap},
9     ffi::{CStr, CString},
10     fs::File,
11     io,
12     mem::{self, size_of, MaybeUninit},
13     os::raw::{c_int, c_long},
14     ptr::{addr_of, addr_of_mut},
15     str::FromStr,
16     sync::{
17         atomic::{AtomicBool, AtomicU64, Ordering},
18         Arc,
19     },
20     time::Duration,
21 };
22 
23 use base::{
24     error, ioctl_ior_nr, ioctl_iow_nr, ioctl_iowr_nr, ioctl_with_mut_ptr, ioctl_with_ptr, syscall,
25     AsRawDescriptor, FileFlags, FromRawDescriptor, RawDescriptor,
26 };
27 use data_model::DataInit;
28 use fuse::filesystem::{
29     Context, DirectoryIterator, Entry, FileSystem, FsOptions, GetxattrReply, IoctlFlags,
30     IoctlReply, ListxattrReply, OpenOptions, RemoveMappingOne, SetattrValid, ZeroCopyReader,
31     ZeroCopyWriter, ROOT_ID,
32 };
33 use fuse::sys::WRITE_KILL_PRIV;
34 use fuse::Mapper;
35 use sync::Mutex;
36 
37 #[cfg(feature = "chromeos")]
38 use {
39     protobuf::Message,
40     system_api::client::OrgChromiumArcQuota,
41     system_api::UserDataAuth::{
42         SetMediaRWDataFileProjectIdReply, SetMediaRWDataFileProjectIdRequest,
43     },
44 };
45 
46 use crate::virtio::fs::caps::{Capability, Caps, Set as CapSet, Value as CapValue};
47 use crate::virtio::fs::multikey::MultikeyBTreeMap;
48 use crate::virtio::fs::read_dir::ReadDir;
49 
// Nul-terminated path strings handed directly to libc calls.
/// The empty path, used together with `AT_EMPTY_PATH`.
const EMPTY_CSTR: &[u8] = b"\0";
/// The root directory path.
const ROOT_CSTR: &[u8] = b"/\0";
/// The path of the proc file system.
const PROC_CSTR: &[u8] = b"/proc\0";

// Extended attribute name prefixes (no nul terminator).
/// Prefix prepended to re-written security xattr names.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
/// Prefix shared by every xattr in the security namespace.
const SECURITY_XATTR: &[u8] = b"security.";
/// The SELinux label xattr, which is never re-written.
const SELINUX_XATTR: &[u8] = b"security.selinux";

/// Length in bytes of the master key descriptor stored in a v1 fscrypt policy.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
/// Length in bytes of the master key identifier stored in a v2 fscrypt policy.
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

/// Timeout applied to D-Bus calls; 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "chromeos")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
64 
65 #[repr(C)]
66 #[derive(Clone, Copy)]
67 struct fscrypt_policy_v1 {
68     _version: u8,
69     _contents_encryption_mode: u8,
70     _filenames_encryption_mode: u8,
71     _flags: u8,
72     _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
73 }
74 unsafe impl DataInit for fscrypt_policy_v1 {}
75 
76 #[repr(C)]
77 #[derive(Clone, Copy)]
78 struct fscrypt_policy_v2 {
79     _version: u8,
80     _contents_encryption_mode: u8,
81     _filenames_encryption_mode: u8,
82     _flags: u8,
83     __reserved: [u8; 4],
84     master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
85 }
86 unsafe impl DataInit for fscrypt_policy_v2 {}
87 
88 #[repr(C)]
89 #[derive(Copy, Clone)]
90 union fscrypt_policy {
91     _version: u8,
92     _v1: fscrypt_policy_v1,
93     _v2: fscrypt_policy_v2,
94 }
95 unsafe impl DataInit for fscrypt_policy {}
96 
97 #[repr(C)]
98 #[derive(Copy, Clone)]
99 struct fscrypt_get_policy_ex_arg {
100     policy_size: u64,       /* input/output */
101     policy: fscrypt_policy, /* output */
102 }
103 unsafe impl DataInit for fscrypt_get_policy_ex_arg {}
104 
105 ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
106 
107 #[repr(C)]
108 #[derive(Clone, Copy)]
109 struct fsxattr {
110     fsx_xflags: u32,     /* xflags field value (get/set) */
111     fsx_extsize: u32,    /* extsize field value (get/set)*/
112     fsx_nextents: u32,   /* nextents field value (get)	*/
113     fsx_projid: u32,     /* project identifier (get/set) */
114     fsx_cowextsize: u32, /* CoW extsize field value (get/set)*/
115     fsx_pad: [u8; 8],
116 }
117 unsafe impl DataInit for fsxattr {}
118 
119 ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
120 ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);
121 
122 ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
123 ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);
124 
125 ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
126 ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);
127 
128 ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
129 ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
130 
131 #[repr(C)]
132 #[derive(Clone, Copy)]
133 struct fsverity_enable_arg {
134     _version: u32,
135     _hash_algorithm: u32,
136     _block_size: u32,
137     salt_size: u32,
138     salt_ptr: u64,
139     sig_size: u32,
140     __reserved1: u32,
141     sig_ptr: u64,
142     __reserved2: [u64; 11],
143 }
144 unsafe impl DataInit for fsverity_enable_arg {}
145 
146 #[repr(C)]
147 #[derive(Clone, Copy)]
148 struct fsverity_digest {
149     _digest_algorithm: u16,
150     digest_size: u16,
151     // __u8 digest[];
152 }
153 unsafe impl DataInit for fsverity_digest {}
154 
155 ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
156 ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
157 
/// Identifier of an inode tracked by the file system.
type Inode = u64;
/// Identifier of an open file or directory handle.
type Handle = u64;
160 
161 #[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
162 struct InodeAltKey {
163     ino: libc::ino64_t,
164     dev: libc::dev_t,
165 }
166 
167 #[derive(PartialEq, Eq)]
168 enum FileType {
169     Regular,
170     Directory,
171     Other,
172 }
173 
174 impl From<libc::mode_t> for FileType {
from(mode: libc::mode_t) -> Self175     fn from(mode: libc::mode_t) -> Self {
176         match mode & libc::S_IFMT {
177             libc::S_IFREG => FileType::Regular,
178             libc::S_IFDIR => FileType::Directory,
179             _ => FileType::Other,
180         }
181     }
182 }
183 
184 struct InodeData {
185     inode: Inode,
186     // (File, open_flags)
187     file: Mutex<(File, libc::c_int)>,
188     refcount: AtomicU64,
189     filetype: FileType,
190 }
191 
192 impl AsRawDescriptor for InodeData {
as_raw_descriptor(&self) -> RawDescriptor193     fn as_raw_descriptor(&self) -> RawDescriptor {
194         self.file.lock().0.as_raw_descriptor()
195     }
196 }
197 
198 struct HandleData {
199     inode: Inode,
200     file: Mutex<File>,
201 }
202 
203 impl AsRawDescriptor for HandleData {
as_raw_descriptor(&self) -> RawDescriptor204     fn as_raw_descriptor(&self) -> RawDescriptor {
205         self.file.lock().as_raw_descriptor()
206     }
207 }
208 
// Generates a guard type `$name` that switches one credential of type `$ty` for the current
// thread using the raw syscall `$syscall_nr`, and switches it back when the guard is dropped.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            // Value to restore on drop.
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Returns `Ok(None)`
            // when `val` already equals `old`; otherwise returns a guard that changes the
            // thread's credentials back to `old` when it is dropped.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                // Nothing to do since we already have the correct value.
                if val == old {
                    return Ok(None);
                }

                // Credential changes must stay per-thread, otherwise they would interfere with
                // operations running on other threads under different uids/gids. POSIX, however,
                // requires all threads of a process to share credentials, and libc enforces that
                // by signalling the other threads to make the same change.
                //
                // Invoking the syscall directly sidesteps that behavior. setfsuid/setfsgid would
                // be an alternative, but they cannot report errors, so this is preferable.

                // This call is safe because it doesn't modify any memory and we check the return
                // value.
                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
                if res != 0 {
                    return Err(io::Error::last_os_error());
                }

                Ok(Some($name { old }))
            }
        }

        impl Drop for $name {
            fn drop(&mut self) {
                // Safe for the same reason as in `new`: no memory is modified and the return
                // value is checked.
                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if res >= 0 {
                    return;
                }

                error!(
                    "failed to change credentials back to {}: {}",
                    self.old,
                    io::Error::last_os_error(),
                );
            }
        }
    };
}
262 #[cfg(not(target_arch = "arm"))]
263 scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
264 #[cfg(target_arch = "arm")]
265 scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);
266 
267 #[cfg(not(target_arch = "arm"))]
268 scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
269 #[cfg(target_arch = "arm")]
270 scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);
271 
272 #[cfg(not(target_arch = "arm"))]
273 const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
274 #[cfg(target_arch = "arm")]
275 const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;
276 
277 #[cfg(not(target_arch = "arm"))]
278 const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
279 #[cfg(target_arch = "arm")]
280 const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
281 
282 thread_local! {
283     // Both these calls are safe because they take no parameters, and only return an integer value.
284     // The kernel also guarantees that they can never fail.
285     static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
286     static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
287 }
288 
set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)>289 fn set_creds(
290     uid: libc::uid_t,
291     gid: libc::gid_t,
292 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
293     let olduid = THREAD_EUID.with(|uid| *uid);
294     let oldgid = THREAD_EGID.with(|gid| *gid);
295 
296     // We have to change the gid before we change the uid because if we change the uid first then we
297     // lose the capability to change the gid.  However changing back can happen in any order.
298     ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
299 }
300 
301 struct ScopedUmask {
302     old: libc::mode_t,
303     mask: libc::mode_t,
304 }
305 
306 impl ScopedUmask {
new(mask: libc::mode_t) -> ScopedUmask307     fn new(mask: libc::mode_t) -> ScopedUmask {
308         ScopedUmask {
309             // Safe because this doesn't modify any memory and always succeeds.
310             old: unsafe { libc::umask(mask) },
311             mask,
312         }
313     }
314 }
315 
316 impl Drop for ScopedUmask {
drop(&mut self)317     fn drop(&mut self) {
318         // Safe because this doesn't modify any memory and always succeeds.
319         let previous = unsafe { libc::umask(self.old) };
320         debug_assert_eq!(
321             previous, self.mask,
322             "umask changed while holding ScopedUmask"
323         );
324     }
325 }
326 
327 struct ScopedFsetid(Caps);
328 impl Drop for ScopedFsetid {
drop(&mut self)329     fn drop(&mut self) {
330         if let Err(e) = raise_cap_fsetid(&mut self.0) {
331             error!(
332                 "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
333                 e
334             )
335         }
336     }
337 }
338 
raise_cap_fsetid(c: &mut Caps) -> io::Result<()>339 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
340     c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
341     c.apply()
342 }
343 
344 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
345 // adds the capability back when it is dropped.
drop_cap_fsetid() -> io::Result<ScopedFsetid>346 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
347     let mut caps = Caps::for_current_thread()?;
348     caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
349     caps.apply()?;
350     Ok(ScopedFsetid(caps))
351 }
352 
ebadf() -> io::Error353 fn ebadf() -> io::Error {
354     io::Error::from_raw_os_error(libc::EBADF)
355 }
356 
stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64>357 fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
358     let mut st = MaybeUninit::<libc::stat64>::zeroed();
359 
360     // Safe because this is a constant value and a valid C string.
361     let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
362 
363     // Safe because the kernel will only write data in `st` and we check the return
364     // value.
365     syscall!(unsafe {
366         libc::fstatat64(
367             f.as_raw_descriptor(),
368             pathname.as_ptr(),
369             st.as_mut_ptr(),
370             libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
371         )
372     })?;
373 
374     // Safe because the kernel guarantees that the struct is now fully initialized.
375     Ok(unsafe { st.assume_init() })
376 }
377 
statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64>378 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
379     let mut st = MaybeUninit::<libc::stat64>::zeroed();
380 
381     // Safe because the kernel will only write data in `st` and we check the return
382     // value.
383     syscall!(unsafe {
384         libc::fstatat64(
385             dir.as_raw_descriptor(),
386             name.as_ptr(),
387             st.as_mut_ptr(),
388             libc::AT_SYMLINK_NOFOLLOW,
389         )
390     })?;
391 
392     // Safe because the kernel guarantees that the struct is now fully initialized.
393     Ok(unsafe { st.assume_init() })
394 }
395 
/// The caching policy that the file system reports to the FUSE client.
///
/// The FUSE protocol defaults to close-to-open consistency: cached contents of a file are
/// invalidated the next time that file is opened.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CachePolicy {
    /// The client never caches file data; all I/O is forwarded directly to the server. This
    /// policy must be selected when file contents may change without the knowledge of the FUSE
    /// client (i.e., the file system does not have exclusive access to the directory).
    Never,

    /// The client decides when and how to cache file data. This is the default policy and uses
    /// close-to-open consistency as described in the enum documentation.
    Auto,

    /// The client always caches file data, meaning it will not invalidate cached data that the
    /// file system returned the last time the file was opened. This policy should only be
    /// selected when the file system has exclusive access to the directory.
    Always,
}

impl FromStr for CachePolicy {
    type Err = &'static str;

    /// Parses a policy name. Each name is accepted in exactly three spellings: all lower case,
    /// capitalized, and all upper case. Anything else yields `Err("invalid cache policy")`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let is_any_of = |spellings: [&str; 3]| spellings.contains(&s);

        if is_any_of(["never", "Never", "NEVER"]) {
            Ok(CachePolicy::Never)
        } else if is_any_of(["auto", "Auto", "AUTO"]) {
            Ok(CachePolicy::Auto)
        } else if is_any_of(["always", "Always", "ALWAYS"]) {
            Ok(CachePolicy::Always)
        } else {
            Err("invalid cache policy")
        }
    }
}

impl Default for CachePolicy {
    /// The default policy is [`CachePolicy::Auto`].
    fn default() -> Self {
        Self::Auto
    }
}
435 
436 /// Options that configure the behavior of the file system.
437 #[derive(Debug, Clone)]
438 pub struct Config {
439     /// How long the FUSE client should consider directory entries to be valid. If the contents of a
440     /// directory can only be modified by the FUSE client (i.e., the file system has exclusive
441     /// access), then this should be a large value.
442     ///
443     /// The default value for this option is 5 seconds.
444     pub entry_timeout: Duration,
445 
446     /// How long the FUSE client should consider file and directory attributes to be valid. If the
447     /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file
448     /// system has exclusive access), then this should be set to a large value.
449     ///
450     /// The default value for this option is 5 seconds.
451     pub attr_timeout: Duration,
452 
453     /// The caching policy the file system should use. See the documentation of `CachePolicy` for
454     /// more details.
455     pub cache_policy: CachePolicy,
456 
457     /// Whether the file system should enabled writeback caching. This can improve performance as it
458     /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file
459     /// system. However, enabling this option can increase the risk of data corruption if the file
460     /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT**
461     /// have exclusive access). Additionally, the file system should have read access to all files
462     /// in the directory it is serving as the FUSE client may send read requests even for files
463     /// opened with `O_WRONLY`.
464     ///
465     /// Therefore callers should only enable this option when they can guarantee that: 1) the file
466     /// system has exclusive access to the directory and 2) the file system has read permissions for
467     /// all files in that directory.
468     ///
469     /// The default value for this option is `false`.
470     pub writeback: bool,
471 
472     /// Controls whether security.* xattrs (except for security.selinux) are re-written. When this
473     /// is set to true, the server will add a "user.virtiofs" prefix to xattrs in the security
474     /// namespace. Setting these xattrs requires CAP_SYS_ADMIN in the namespace where the file
475     /// system was mounted and since the server usually runs in an unprivileged user namespace, it's
476     /// unlikely to have that capability.
477     ///
478     /// The default value for this option is `false`.
479     pub rewrite_security_xattrs: bool,
480 
481     /// Use case-insensitive lookups for directory entries (ASCII only).
482     ///
483     /// The default value for this option is `false`.
484     pub ascii_casefold: bool,
485 
486     // UIDs which are privileged to perform quota-related operations. We cannot perform a CAP_FOWNER
487     // check so we consult this list when the VM tries to set the project quota and the process uid
488     // doesn't match the owner uid. In that case, all uids in this list are treated as if they have
489     // CAP_FOWNER.
490     #[cfg(feature = "chromeos")]
491     pub privileged_quota_uids: Vec<libc::uid_t>,
492 
493     /// Use DAX for shared files.
494     ///
495     /// Enabling DAX can improve performance for frequently accessed files by mapping regions of the
496     /// file directly into the VM's memory region, allowing direct access with the cost of slightly
497     /// increased latency the first time the file is accessed. Additionally, since the mapping is
498     /// shared directly from the host kernel's file cache, enabling DAX can improve performance even
499     /// when the cache policy is `Never`.
500     ///
501     /// The default value for this option is `false`.
502     pub use_dax: bool,
503 
504     /// Enable support for POSIX acls.
505     ///
506     /// Enable POSIX acl support for the shared directory. This requires that the underlying file
507     /// system also supports POSIX acls.
508     ///
509     /// The default value for this option is `true`.
510     pub posix_acl: bool,
511 }
512 
513 impl Default for Config {
default() -> Self514     fn default() -> Self {
515         Config {
516             entry_timeout: Duration::from_secs(5),
517             attr_timeout: Duration::from_secs(5),
518             cache_policy: Default::default(),
519             writeback: false,
520             rewrite_security_xattrs: false,
521             ascii_casefold: false,
522             #[cfg(feature = "chromeos")]
523             privileged_quota_uids: Default::default(),
524             use_dax: false,
525             posix_acl: true,
526         }
527     }
528 }
529 
530 /// A file system that simply "passes through" all requests it receives to the underlying file
531 /// system. To keep the implementation simple it servers the contents of its root directory. Users
532 /// that wish to serve only a specific directory should set up the environment so that that
533 /// directory ends up as the root of the file system process. One way to accomplish this is via a
534 /// combination of mount namespaces and the pivot_root system call.
535 pub struct PassthroughFs {
536     // File descriptors for various points in the file system tree.
537     inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
538     next_inode: AtomicU64,
539 
540     // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
541     // used for reading and writing data.
542     handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
543     next_handle: AtomicU64,
544 
545     // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
546     // `inodes` into one that can go into `handles`. This is accomplished by reading the
547     // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
548     // to be serving doesn't have access to `/proc`.
549     proc: File,
550 
551     // Whether writeback caching is enabled for this directory. This will only be true when
552     // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
553     writeback: AtomicBool,
554 
555     // Whether zero message opens are supported by the kernel driver.
556     zero_message_open: AtomicBool,
557 
558     // Whether zero message opendir is supported by the kernel driver.
559     zero_message_opendir: AtomicBool,
560 
561     // Used to communicate with other processes using D-Bus.
562     #[cfg(feature = "chromeos")]
563     dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
564     #[cfg(feature = "chromeos")]
565     dbus_fd: Option<std::os::unix::io::RawFd>,
566 
567     cfg: Config,
568 }
569 
570 impl PassthroughFs {
new(cfg: Config) -> io::Result<PassthroughFs>571     pub fn new(cfg: Config) -> io::Result<PassthroughFs> {
572         // Safe because this is a constant value and a valid C string.
573         let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
574 
575         // Safe because this doesn't modify any memory and we check the return value.
576         let raw_descriptor = syscall!(unsafe {
577             libc::openat64(
578                 libc::AT_FDCWD,
579                 proc_cstr.as_ptr(),
580                 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
581             )
582         })?;
583 
584         // Privileged UIDs can use D-Bus to perform some operations.
585         #[cfg(feature = "chromeos")]
586         let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
587             (None, None)
588         } else {
589             let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
590                 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
591             channel.set_watch_enabled(true);
592             let dbus_fd = channel.watch().fd;
593             channel.set_watch_enabled(false);
594             (
595                 Some(Mutex::new(dbus::blocking::Connection::from(channel))),
596                 Some(dbus_fd),
597             )
598         };
599 
600         // Safe because we just opened this descriptor.
601         let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
602 
603         Ok(PassthroughFs {
604             inodes: Mutex::new(MultikeyBTreeMap::new()),
605             next_inode: AtomicU64::new(ROOT_ID + 1),
606 
607             handles: Mutex::new(BTreeMap::new()),
608             next_handle: AtomicU64::new(1),
609 
610             proc,
611 
612             writeback: AtomicBool::new(false),
613             zero_message_open: AtomicBool::new(false),
614             zero_message_opendir: AtomicBool::new(false),
615 
616             #[cfg(feature = "chromeos")]
617             dbus_connection,
618             #[cfg(feature = "chromeos")]
619             dbus_fd,
620 
621             cfg,
622         })
623     }
624 
cfg(&self) -> &Config625     pub fn cfg(&self) -> &Config {
626         &self.cfg
627     }
628 
keep_rds(&self) -> Vec<RawDescriptor>629     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
630         #[cfg_attr(not(feature = "chromeos"), allow(unused_mut))]
631         let mut keep_rds = vec![self.proc.as_raw_descriptor()];
632         #[cfg(feature = "chromeos")]
633         if let Some(fd) = self.dbus_fd {
634             keep_rds.push(fd);
635         }
636         keep_rds
637     }
638 
rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr>639     fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
640         if !self.cfg.rewrite_security_xattrs {
641             return Cow::Borrowed(name);
642         }
643 
644         // Does not include nul-terminator.
645         let buf = name.to_bytes();
646         if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
647             return Cow::Borrowed(name);
648         }
649 
650         let mut newname = USER_VIRTIOFS_XATTR.to_vec();
651         newname.extend_from_slice(buf);
652 
653         // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
654         // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
655         Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
656     }
657 
find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>>658     fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
659         self.inodes
660             .lock()
661             .get(&inode)
662             .map(Arc::clone)
663             .ok_or_else(ebadf)
664     }
665 
find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>>666     fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
667         self.handles
668             .lock()
669             .get(&handle)
670             .filter(|hd| hd.inode == inode)
671             .map(Arc::clone)
672             .ok_or_else(ebadf)
673     }
674 
open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File>675     fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
676         let pathname = CString::new(format!("self/fd/{}", fd))
677             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
678 
679         // Safe because this doesn't modify any memory and we check the return value. We don't
680         // really check `flags` because if the kernel can't handle poorly specified flags then we
681         // have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
682         // to follow the `/proc/self/fd` symlink to get the file.
683         let raw_descriptor = syscall!(unsafe {
684             libc::openat64(
685                 self.proc.as_raw_descriptor(),
686                 pathname.as_ptr(),
687                 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
688             )
689         })?;
690 
691         // Safe because we just opened this descriptor.
692         Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
693     }
694 
open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File>695     fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
696         // When writeback caching is enabled, the kernel may send read requests even if the
697         // userspace program opened the file write-only. So we need to ensure that we have opened
698         // the file for reading as well as writing.
699         let writeback = self.writeback.load(Ordering::Relaxed);
700         if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
701             flags &= !libc::O_ACCMODE;
702             flags |= libc::O_RDWR;
703         }
704 
705         // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
706         // However, this breaks atomicity as the file may have changed on disk, invalidating the
707         // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
708         // the file. Just allow this for now as it is the user's responsibility to enable writeback
709         // caching only for directories that are not shared. It also means that we need to clear the
710         // `O_APPEND` flag.
711         if writeback && flags & libc::O_APPEND != 0 {
712             flags &= !libc::O_APPEND;
713         }
714 
715         self.open_fd(inode.as_raw_descriptor(), flags)
716     }
717 
718     // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry719     fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry {
720         let altkey = InodeAltKey {
721             ino: st.st_ino,
722             dev: st.st_dev,
723         };
724         let data = self.inodes.lock().get_alt(&altkey).map(Arc::clone);
725 
726         let inode = if let Some(data) = data {
727             // Matches with the release store in `forget`.
728             data.refcount.fetch_add(1, Ordering::Acquire);
729             data.inode
730         } else {
731             // There is a possible race here where 2 threads end up adding the same file
732             // into the inode list.  However, since each of those will get a unique Inode
733             // value and unique file descriptors this shouldn't be that much of a problem.
734             let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
735             self.inodes.lock().insert(
736                 inode,
737                 InodeAltKey {
738                     ino: st.st_ino,
739                     dev: st.st_dev,
740                 },
741                 Arc::new(InodeData {
742                     inode,
743                     file: Mutex::new((f, open_flags)),
744                     refcount: AtomicU64::new(1),
745                     filetype: st.st_mode.into(),
746                 }),
747             );
748 
749             inode
750         };
751 
752         Entry {
753             inode,
754             generation: 0,
755             attr: st,
756             attr_timeout: self.cfg.attr_timeout,
757             entry_timeout: self.cfg.entry_timeout,
758         }
759     }
760 
761     // Performs an ascii case insensitive lookup.
ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry>762     fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
763         let mut buf = [0u8; 1024];
764         let mut offset = 0;
765         loop {
766             let mut read_dir = ReadDir::new(parent, offset, &mut buf[..])?;
767             if read_dir.remaining() == 0 {
768                 break;
769             }
770 
771             while let Some(entry) = read_dir.next() {
772                 offset = entry.offset as libc::off64_t;
773                 if name.eq_ignore_ascii_case(entry.name.to_bytes()) {
774                     return self.do_lookup(parent, entry.name);
775                 }
776             }
777         }
778         Err(io::Error::from_raw_os_error(libc::ENOENT))
779     }
780 
do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry>781     fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
782         let st = statat(parent, name)?;
783 
784         let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
785         match FileType::from(st.st_mode) {
786             FileType::Regular => {}
787             FileType::Directory => flags |= libc::O_DIRECTORY,
788             FileType::Other => flags |= libc::O_PATH,
789         }
790 
791         // Safe because this doesn't modify any memory and we check the return value.
792         let f = unsafe {
793             File::from_raw_descriptor(syscall!(libc::openat64(
794                 parent.as_raw_descriptor(),
795                 name.as_ptr(),
796                 flags
797             ))?)
798         };
799 
800         Ok(self.add_entry(f, st, flags))
801     }
802 
do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)>803     fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
804         let inode_data = self.find_inode(inode)?;
805 
806         let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
807 
808         let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
809         let data = HandleData { inode, file };
810 
811         self.handles.lock().insert(handle, Arc::new(data));
812 
813         let mut opts = OpenOptions::empty();
814         match self.cfg.cache_policy {
815             // We only set the direct I/O option on files.
816             CachePolicy::Never => opts.set(
817                 OpenOptions::DIRECT_IO,
818                 flags & (libc::O_DIRECTORY as u32) == 0,
819             ),
820             CachePolicy::Always => {
821                 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
822                     OpenOptions::KEEP_CACHE
823                 } else {
824                     OpenOptions::CACHE_DIR
825                 }
826             }
827             _ => {}
828         };
829 
830         Ok((Some(handle), opts))
831     }
832 
do_release(&self, inode: Inode, handle: Handle) -> io::Result<()>833     fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
834         let mut handles = self.handles.lock();
835 
836         if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
837             if e.get().inode == inode {
838                 // We don't need to close the file here because that will happen automatically when
839                 // the last `Arc` is dropped.
840                 e.remove();
841                 return Ok(());
842             }
843         }
844 
845         Err(ebadf())
846     }
847 
do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)>848     fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
849         let st = stat(inode)?;
850 
851         Ok((st, self.cfg.attr_timeout))
852     }
853 
do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()>854     fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
855         // Safe because this doesn't modify any memory and we check the return value.
856         syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
857         Ok(())
858     }
859 
do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()>860     fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
861         // Safe because this doesn't modify any memory and we check the return value.
862         syscall!(unsafe {
863             if datasync {
864                 libc::fdatasync(file.as_raw_descriptor())
865             } else {
866                 libc::fsync(file.as_raw_descriptor())
867             }
868         })?;
869 
870         Ok(())
871     }
872 
873     // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
874     // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
875     // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
876     // root inode.
with_proc_chdir<F, T>(&self, f: F) -> T where F: FnOnce() -> T,877     fn with_proc_chdir<F, T>(&self, f: F) -> T
878     where
879         F: FnOnce() -> T,
880     {
881         let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
882 
883         // Safe because this doesn't modify any memory and we check the return value. Since the
884         // fchdir should never fail we just use debug_asserts.
885         let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
886         debug_assert_eq!(
887             proc_cwd,
888             0,
889             "failed to fchdir to /proc: {}",
890             io::Error::last_os_error()
891         );
892 
893         let res = f();
894 
895         // Safe because this doesn't modify any memory and we check the return value. Since the
896         // fchdir should never fail we just use debug_asserts.
897         let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
898         debug_assert_eq!(
899             root_cwd,
900             0,
901             "failed to fchdir back to root directory: {}",
902             io::Error::last_os_error()
903         );
904 
905         res
906     }
907 
do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize>908     fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
909         let res = if inode.filetype == FileType::Other {
910             // For non-regular files and directories, we cannot open the fd normally. Instead we
911             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
912             // and then setting the CWD back to the root directory.
913             let path = CString::new(format!("self/fd/{}", inode.as_raw_descriptor()))
914                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
915 
916             // Safe because this will only modify `value` and we check the return value.
917             self.with_proc_chdir(|| unsafe {
918                 libc::getxattr(
919                     path.as_ptr(),
920                     name.as_ptr(),
921                     value.as_mut_ptr() as *mut libc::c_void,
922                     value.len() as libc::size_t,
923                 )
924             })
925         } else {
926             // For regular files and directories, we can just use fgetxattr. Safe because this will
927             // only write to `value` and we check the return value.
928             unsafe {
929                 libc::fgetxattr(
930                     inode.as_raw_descriptor(),
931                     name.as_ptr(),
932                     value.as_mut_ptr() as *mut libc::c_void,
933                     value.len() as libc::size_t,
934                 )
935             }
936         };
937 
938         if res < 0 {
939             Err(io::Error::last_os_error())
940         } else {
941             Ok(res as usize)
942         }
943     }
944 
get_encryption_policy_ex<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>945     fn get_encryption_policy_ex<R: io::Read>(
946         &self,
947         inode: Inode,
948         handle: Handle,
949         mut r: R,
950     ) -> io::Result<IoctlReply> {
951         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
952             self.find_inode(inode)?
953         } else {
954             self.find_handle(handle, inode)?
955         };
956 
957         // Safe because this only has integer fields.
958         let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
959         r.read_exact(arg.policy_size.as_mut_slice())?;
960 
961         let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
962         arg.policy_size = policy_size;
963 
964         // Safe because the kernel will only write to `arg` and we check the return value.
965         let res =
966             unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
967         if res < 0 {
968             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
969         } else {
970             let len = size_of::<u64>() + arg.policy_size as usize;
971             Ok(IoctlReply::Done(Ok(arg.as_slice()[..len].to_vec())))
972         }
973     }
974 
get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>975     fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
976         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
977             self.find_inode(inode)?
978         } else {
979             self.find_handle(handle, inode)?
980         };
981 
982         let mut buf = MaybeUninit::<fsxattr>::zeroed();
983 
984         // Safe because the kernel will only write to `buf` and we check the return value.
985         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
986         if res < 0 {
987             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
988         } else {
989             // Safe because the kernel guarantees that the policy is now initialized.
990             let xattr = unsafe { buf.assume_init() };
991             Ok(IoctlReply::Done(Ok(xattr.as_slice().to_vec())))
992         }
993     }
994 
set_fsxattr<R: io::Read>( &self, #[cfg_attr(not(feature = "chromeos"), allow(unused_variables))] ctx: Context, inode: Inode, handle: Handle, r: R, ) -> io::Result<IoctlReply>995     fn set_fsxattr<R: io::Read>(
996         &self,
997         #[cfg_attr(not(feature = "chromeos"), allow(unused_variables))] ctx: Context,
998         inode: Inode,
999         handle: Handle,
1000         r: R,
1001     ) -> io::Result<IoctlReply> {
1002         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1003             self.find_inode(inode)?
1004         } else {
1005             self.find_handle(handle, inode)?
1006         };
1007 
1008         let in_attr = fsxattr::from_reader(r)?;
1009 
1010         #[cfg(feature = "chromeos")]
1011         let st = stat(&*data)?;
1012 
1013         // Changing quota project ID requires CAP_FOWNER or being file owner.
1014         // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1015         #[cfg(feature = "chromeos")]
1016         if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
1017             // Get the current fsxattr.
1018             let mut buf = MaybeUninit::<fsxattr>::zeroed();
1019             // Safe because the kernel will only write to `buf` and we check the return value.
1020             let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
1021             if res < 0 {
1022                 return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1023             }
1024             // Safe because the kernel guarantees that the policy is now initialized.
1025             let current_attr = unsafe { buf.assume_init() };
1026 
1027             // Project ID cannot be changed inside a user namespace.
1028             // Use UserDataAuth to avoid this restriction.
1029             if current_attr.fsx_projid != in_attr.fsx_projid {
1030                 let connection = self.dbus_connection.as_ref().unwrap().lock();
1031                 let proxy = connection.with_proxy(
1032                     "org.chromium.UserDataAuth",
1033                     "/org/chromium/UserDataAuth",
1034                     DEFAULT_DBUS_TIMEOUT,
1035                 );
1036                 let mut proto: SetMediaRWDataFileProjectIdRequest = Message::new();
1037                 proto.project_id = in_attr.fsx_projid;
1038                 // Safe because data is a valid file descriptor.
1039                 let fd = unsafe { dbus::arg::OwnedFd::new(base::clone_descriptor(&*data)?) };
1040                 match proxy.set_media_rwdata_file_project_id(fd, proto.write_to_bytes().unwrap()) {
1041                     Ok(r) => {
1042                         let r = protobuf::parse_from_bytes::<SetMediaRWDataFileProjectIdReply>(&r)
1043                             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1044                         if !r.success {
1045                             return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1046                                 r.error,
1047                             ))));
1048                         }
1049                     }
1050                     Err(e) => {
1051                         return Err(io::Error::new(io::ErrorKind::Other, e));
1052                     }
1053                 };
1054             }
1055         }
1056 
1057         //  Safe because this doesn't modify any memory and we check the return value.
1058         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &in_attr) };
1059         if res < 0 {
1060             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1061         } else {
1062             Ok(IoctlReply::Done(Ok(Vec::new())))
1063         }
1064     }
1065 
get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1066     fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1067         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1068             self.find_inode(inode)?
1069         } else {
1070             self.find_handle(handle, inode)?
1071         };
1072 
1073         // The ioctl encoding is a long but the parameter is actually an int.
1074         let mut flags: c_int = 0;
1075 
1076         // Safe because the kernel will only write to `flags` and we check the return value.
1077         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut flags) };
1078         if res < 0 {
1079             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1080         } else {
1081             Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1082         }
1083     }
1084 
set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply>1085     fn set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply> {
1086         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1087             self.find_inode(inode)?
1088         } else {
1089             self.find_handle(handle, inode)?
1090         };
1091 
1092         // The ioctl encoding is a long but the parameter is actually an int.
1093         let flags = c_int::from_reader(r)?;
1094 
1095         // Safe because this doesn't modify any memory and we check the return value.
1096         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &flags) };
1097         if res < 0 {
1098             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1099         } else {
1100             Ok(IoctlReply::Done(Ok(Vec::new())))
1101         }
1102     }
1103 
enable_verity<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1104     fn enable_verity<R: io::Read>(
1105         &self,
1106         inode: Inode,
1107         handle: Handle,
1108         mut r: R,
1109     ) -> io::Result<IoctlReply> {
1110         let inode_data = self.find_inode(inode)?;
1111 
1112         // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1113         match inode_data.filetype {
1114             FileType::Regular => {}
1115             FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1116             FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1117         }
1118 
1119         {
1120             // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1121             let mut file = inode_data.file.lock();
1122             let mut flags = file.1;
1123             match flags & libc::O_ACCMODE {
1124                 libc::O_WRONLY | libc::O_RDWR => {
1125                     flags &= !libc::O_ACCMODE;
1126                     flags |= libc::O_RDONLY;
1127 
1128                     // We need to get a read-only handle for this file.
1129                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDONLY)?;
1130                     *file = (newfile, flags);
1131                 }
1132                 libc::O_RDONLY => {}
1133                 _ => panic!("Unexpected flags: {:#x}", flags),
1134             }
1135         }
1136 
1137         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1138             inode_data
1139         } else {
1140             let data = self.find_handle(handle, inode)?;
1141 
1142             {
1143                 // We can't enable verity while holding a writable fd. We don't know whether the file
1144                 // was opened for writing so check it here. We don't expect this to be a frequent
1145                 // operation so the extra latency should be fine.
1146                 let mut file = data.file.lock();
1147                 let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1148                 match flags {
1149                     FileFlags::ReadWrite | FileFlags::Write => {
1150                         // We need to get a read-only handle for this file.
1151                         *file = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1152                     }
1153                     FileFlags::Read => {}
1154                 }
1155             }
1156 
1157             data
1158         };
1159 
1160         let mut arg = fsverity_enable_arg::from_reader(&mut r)?;
1161 
1162         let mut salt;
1163         if arg.salt_size > 0 {
1164             if arg.salt_size > self.max_buffer_size() {
1165                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1166                     libc::ENOMEM,
1167                 ))));
1168             }
1169             salt = vec![0; arg.salt_size as usize];
1170             r.read_exact(&mut salt)?;
1171             arg.salt_ptr = salt.as_ptr() as usize as u64;
1172         } else {
1173             arg.salt_ptr = 0;
1174         }
1175 
1176         let mut sig;
1177         if arg.sig_size > 0 {
1178             if arg.sig_size > self.max_buffer_size() {
1179                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1180                     libc::ENOMEM,
1181                 ))));
1182             }
1183             sig = vec![0; arg.sig_size as usize];
1184             r.read_exact(&mut sig)?;
1185             arg.sig_ptr = sig.as_ptr() as usize as u64;
1186         } else {
1187             arg.sig_ptr = 0;
1188         }
1189 
1190         // Safe because this doesn't modify any memory and we check the return value.
1191         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY(), &arg) };
1192         if res < 0 {
1193             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1194         } else {
1195             Ok(IoctlReply::Done(Ok(Vec::new())))
1196         }
1197     }
1198 
measure_verity<R: io::Read>( &self, inode: Inode, handle: Handle, r: R, out_size: u32, ) -> io::Result<IoctlReply>1199     fn measure_verity<R: io::Read>(
1200         &self,
1201         inode: Inode,
1202         handle: Handle,
1203         r: R,
1204         out_size: u32,
1205     ) -> io::Result<IoctlReply> {
1206         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1207             self.find_inode(inode)?
1208         } else {
1209             self.find_handle(handle, inode)?
1210         };
1211 
1212         let digest = fsverity_digest::from_reader(r)?;
1213 
1214         // Taken from fs/verity/fsverity_private.h.
1215         const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
1216 
1217         // This digest size is what the fsverity command line utility uses.
1218         const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
1219         const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
1220         const ROUNDED_LEN: usize =
1221             (BUFLEN + size_of::<fsverity_digest>() - 1) / size_of::<fsverity_digest>();
1222 
1223         // Make sure we get a properly aligned allocation.
1224         let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
1225 
1226         // Safe because we are only writing data and not reading uninitialized memory.
1227         unsafe {
1228             // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
1229             addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
1230                 .write(DIGEST_SIZE)
1231         };
1232 
1233         // Safe because this will only modify `buf` and we check the return value.
1234         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY(), buf.as_mut_ptr()) };
1235         if res < 0 {
1236             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1237         } else {
1238             // Safe because this value was initialized by us already and then overwritten by the
1239             // kernel.
1240             // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
1241             let digest_size =
1242                 unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
1243             let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
1244 
1245             // The kernel guarantees this but it doesn't hurt to be paranoid.
1246             debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
1247             if digest.digest_size < digest_size || out_size < outlen {
1248                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1249                     libc::EOVERFLOW,
1250                 ))));
1251             }
1252 
1253             // Safe because any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
1254             // doesn't contain any references.
1255             let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
1256                 unsafe { mem::transmute(buf) };
1257 
1258             // Casting to `*const [u8]` is safe because the kernel guarantees that the first
1259             // `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed to have
1260             // the same layout as `u8`.
1261             // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
1262             let buf =
1263                 unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
1264             Ok(IoctlReply::Done(Ok(buf.to_vec())))
1265         }
1266     }
1267 }
1268 
forget_one( inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, inode: Inode, count: u64, )1269 fn forget_one(
1270     inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
1271     inode: Inode,
1272     count: u64,
1273 ) {
1274     if let Some(data) = inodes.get(&inode) {
1275         // Acquiring the write lock on the inode map prevents new lookups from incrementing the
1276         // refcount but there is the possibility that a previous lookup already acquired a
1277         // reference to the inode data and is in the process of updating the refcount so we need
1278         // to loop here until we can decrement successfully.
1279         loop {
1280             let refcount = data.refcount.load(Ordering::Relaxed);
1281 
1282             // Saturating sub because it doesn't make sense for a refcount to go below zero and
1283             // we don't want misbehaving clients to cause integer overflow.
1284             let new_count = refcount.saturating_sub(count);
1285 
1286             // Synchronizes with the acquire load in `do_lookup`.
1287             if data
1288                 .refcount
1289                 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
1290                 .is_ok()
1291             {
1292                 if new_count == 0 {
1293                     // We just removed the last refcount for this inode. There's no need for an
1294                     // acquire fence here because we hold a write lock on the inode map and any
1295                     // thread that is waiting to do a forget on the same inode will have to wait
1296                     // until we release the lock. So there's is no other release store for us to
1297                     // synchronize with before deleting the entry.
1298                     inodes.remove(&inode);
1299                 }
1300                 break;
1301             }
1302         }
1303     }
1304 }
1305 
1306 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
1307 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
strip_xattr_prefix(buf: &mut Vec<u8>)1308 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
1309     fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
1310         if start >= b.len() {
1311             return None;
1312         }
1313 
1314         let end = b[start..]
1315             .iter()
1316             .position(|&c| c == b'\0')
1317             .map(|p| start + p + 1)
1318             .unwrap_or(b.len());
1319 
1320         Some(&b[start..end])
1321     }
1322 
1323     let mut pos = 0;
1324     while let Some(name) = next_cstr(buf, pos) {
1325         if !name.starts_with(USER_VIRTIOFS_XATTR) {
1326             pos += name.len();
1327             continue;
1328         }
1329 
1330         let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
1331         buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
1332         pos += newlen;
1333     }
1334 }
1335 
1336 impl FileSystem for PassthroughFs {
1337     type Inode = Inode;
1338     type Handle = Handle;
1339     type DirIter = ReadDir<Box<[u8]>>;
1340 
init(&self, capable: FsOptions) -> io::Result<FsOptions>1341     fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
1342         // Safe because this is a constant value and a valid C string.
1343         let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
1344 
1345         let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1346         // Safe because this doesn't modify any memory and we check the return value.
1347         let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
1348         if raw_descriptor < 0 {
1349             return Err(io::Error::last_os_error());
1350         }
1351 
1352         // Safe because we just opened this descriptor above.
1353         let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
1354 
1355         let st = stat(&f)?;
1356 
1357         // Safe because this doesn't modify any memory and there is no need to check the return
1358         // value because this system call always succeeds. We need to clear the umask here because
1359         // we want the client to be able to set all the bits in the mode.
1360         unsafe { libc::umask(0o000) };
1361 
1362         let mut inodes = self.inodes.lock();
1363 
1364         // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1365         inodes.insert(
1366             ROOT_ID,
1367             InodeAltKey {
1368                 ino: st.st_ino,
1369                 dev: st.st_dev,
1370             },
1371             Arc::new(InodeData {
1372                 inode: ROOT_ID,
1373                 file: Mutex::new((f, flags)),
1374                 refcount: AtomicU64::new(2),
1375                 filetype: st.st_mode.into(),
1376             }),
1377         );
1378 
1379         let mut opts = FsOptions::DO_READDIRPLUS
1380             | FsOptions::READDIRPLUS_AUTO
1381             | FsOptions::EXPORT_SUPPORT
1382             | FsOptions::DONT_MASK;
1383         if self.cfg.posix_acl {
1384             opts |= FsOptions::POSIX_ACL;
1385         }
1386         if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
1387             opts |= FsOptions::WRITEBACK_CACHE;
1388             self.writeback.store(true, Ordering::Relaxed);
1389         }
1390         if self.cfg.cache_policy == CachePolicy::Always {
1391             if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
1392                 opts |= FsOptions::ZERO_MESSAGE_OPEN;
1393                 self.zero_message_open.store(true, Ordering::Relaxed);
1394             }
1395             if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
1396                 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
1397                 self.zero_message_opendir.store(true, Ordering::Relaxed);
1398             }
1399         }
1400         Ok(opts)
1401     }
1402 
destroy(&self)1403     fn destroy(&self) {
1404         self.handles.lock().clear();
1405         self.inodes.lock().clear();
1406     }
1407 
statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64>1408     fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
1409         let data = self.find_inode(inode)?;
1410 
1411         let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1412 
1413         // Safe because this will only modify `out` and we check the return value.
1414         syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
1415 
1416         // Safe because the kernel guarantees that `out` has been initialized.
1417         Ok(unsafe { out.assume_init() })
1418     }
1419 
lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry>1420     fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
1421         let data = self.find_inode(parent)?;
1422         self.do_lookup(&data, name).or_else(|e| {
1423             if self.cfg.ascii_casefold {
1424                 self.ascii_casefold_lookup(&data, name.to_bytes())
1425             } else {
1426                 Err(e)
1427             }
1428         })
1429     }
1430 
forget(&self, _ctx: Context, inode: Inode, count: u64)1431     fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
1432         let mut inodes = self.inodes.lock();
1433 
1434         forget_one(&mut inodes, inode, count)
1435     }
1436 
batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>)1437     fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
1438         let mut inodes = self.inodes.lock();
1439 
1440         for (inode, count) in requests {
1441             forget_one(&mut inodes, inode, count)
1442         }
1443     }
1444 
opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1445     fn opendir(
1446         &self,
1447         _ctx: Context,
1448         inode: Inode,
1449         flags: u32,
1450     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1451         if self.zero_message_opendir.load(Ordering::Relaxed) {
1452             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1453         } else {
1454             self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
1455         }
1456     }
1457 
releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()>1458     fn releasedir(
1459         &self,
1460         _ctx: Context,
1461         inode: Inode,
1462         _flags: u32,
1463         handle: Handle,
1464     ) -> io::Result<()> {
1465         if self.zero_message_opendir.load(Ordering::Relaxed) {
1466             Ok(())
1467         } else {
1468             self.do_release(inode, handle)
1469         }
1470     }
1471 
mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, ) -> io::Result<Entry>1472     fn mkdir(
1473         &self,
1474         ctx: Context,
1475         parent: Inode,
1476         name: &CStr,
1477         mode: u32,
1478         umask: u32,
1479     ) -> io::Result<Entry> {
1480         let data = self.find_inode(parent)?;
1481 
1482         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1483         {
1484             let _scoped_umask = ScopedUmask::new(umask);
1485 
1486             // Safe because this doesn't modify any memory and we check the return value.
1487             syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
1488         }
1489 
1490         self.do_lookup(&data, name)
1491     }
1492 
rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1493     fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1494         let data = self.find_inode(parent)?;
1495         self.do_unlink(&data, name, libc::AT_REMOVEDIR)
1496     }
1497 
readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result<Self::DirIter>1498     fn readdir(
1499         &self,
1500         _ctx: Context,
1501         inode: Inode,
1502         handle: Handle,
1503         size: u32,
1504         offset: u64,
1505     ) -> io::Result<Self::DirIter> {
1506         let buf = vec![0; size as usize].into_boxed_slice();
1507 
1508         if self.zero_message_opendir.load(Ordering::Relaxed) {
1509             let data = self.find_inode(inode)?;
1510             ReadDir::new(&*data, offset as libc::off64_t, buf)
1511         } else {
1512             let data = self.find_handle(handle, inode)?;
1513 
1514             let dir = data.file.lock();
1515 
1516             ReadDir::new(&*dir, offset as libc::off64_t, buf)
1517         }
1518     }
1519 
open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1520     fn open(
1521         &self,
1522         _ctx: Context,
1523         inode: Inode,
1524         flags: u32,
1525     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1526         if self.zero_message_open.load(Ordering::Relaxed) {
1527             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1528         } else {
1529             self.do_open(inode, flags)
1530         }
1531     }
1532 
release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option<u64>, ) -> io::Result<()>1533     fn release(
1534         &self,
1535         _ctx: Context,
1536         inode: Inode,
1537         _flags: u32,
1538         handle: Handle,
1539         _flush: bool,
1540         _flock_release: bool,
1541         _lock_owner: Option<u64>,
1542     ) -> io::Result<()> {
1543         if self.zero_message_open.load(Ordering::Relaxed) {
1544             Ok(())
1545         } else {
1546             self.do_release(inode, handle)
1547         }
1548     }
1549 
chromeos_tmpfile( &self, ctx: Context, parent: Self::Inode, mode: u32, umask: u32, ) -> io::Result<Entry>1550     fn chromeos_tmpfile(
1551         &self,
1552         ctx: Context,
1553         parent: Self::Inode,
1554         mode: u32,
1555         umask: u32,
1556     ) -> io::Result<Entry> {
1557         let data = self.find_inode(parent)?;
1558 
1559         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1560 
1561         let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
1562 
1563         // Safe because this is a valid c string.
1564         let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
1565 
1566         let fd = {
1567             let _scoped_umask = ScopedUmask::new(umask);
1568 
1569             // Safe because this doesn't modify any memory and we check the return value.
1570             syscall!(unsafe {
1571                 libc::openat64(
1572                     data.as_raw_descriptor(),
1573                     current_dir.as_ptr(),
1574                     tmpflags,
1575                     mode,
1576                 )
1577             })?
1578         };
1579 
1580         // Safe because we just opened this fd.
1581         let tmpfile = unsafe { File::from_raw_descriptor(fd) };
1582 
1583         let st = stat(&tmpfile)?;
1584         Ok(self.add_entry(tmpfile, st, tmpflags))
1585     }
1586 
create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>1587     fn create(
1588         &self,
1589         ctx: Context,
1590         parent: Inode,
1591         name: &CStr,
1592         mode: u32,
1593         flags: u32,
1594         umask: u32,
1595     ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
1596         let data = self.find_inode(parent)?;
1597 
1598         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1599 
1600         let create_flags =
1601             (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
1602 
1603         let fd = {
1604             let _scoped_umask = ScopedUmask::new(umask);
1605 
1606             // Safe because this doesn't modify any memory and we check the return value. We don't
1607             // really check `flags` because if the kernel can't handle poorly specified flags then
1608             // we have much bigger problems.
1609             syscall!(unsafe {
1610                 libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
1611             })?
1612         };
1613 
1614         // Safe because we just opened this fd.
1615         let file = unsafe { File::from_raw_descriptor(fd) };
1616 
1617         let st = stat(&file)?;
1618         let entry = self.add_entry(file, st, create_flags);
1619 
1620         let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
1621             (None, OpenOptions::KEEP_CACHE)
1622         } else {
1623             self.do_open(
1624                 entry.inode,
1625                 flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
1626             )
1627             .map_err(|e| {
1628                 // Don't leak the entry.
1629                 self.forget(ctx, entry.inode, 1);
1630                 e
1631             })?
1632         };
1633 
1634         Ok((entry, handle, opts))
1635     }
1636 
unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1637     fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1638         let data = self.find_inode(parent)?;
1639         self.do_unlink(&data, name, 0)
1640     }
1641 
read<W: io::Write + ZeroCopyWriter>( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option<u64>, _flags: u32, ) -> io::Result<usize>1642     fn read<W: io::Write + ZeroCopyWriter>(
1643         &self,
1644         _ctx: Context,
1645         inode: Inode,
1646         handle: Handle,
1647         mut w: W,
1648         size: u32,
1649         offset: u64,
1650         _lock_owner: Option<u64>,
1651         _flags: u32,
1652     ) -> io::Result<usize> {
1653         if self.zero_message_open.load(Ordering::Relaxed) {
1654             let data = self.find_inode(inode)?;
1655 
1656             let mut file = data.file.lock();
1657             let mut flags = file.1;
1658             match flags & libc::O_ACCMODE {
1659                 libc::O_WRONLY => {
1660                     flags &= !libc::O_WRONLY;
1661                     flags |= libc::O_RDWR;
1662 
1663                     // We need to get a readable handle for this file.
1664                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1665                     *file = (newfile, flags);
1666                 }
1667                 libc::O_RDONLY | libc::O_RDWR => {}
1668                 _ => panic!("Unexpected flags: {:#x}", flags),
1669             }
1670 
1671             w.write_from(&mut file.0, size as usize, offset)
1672         } else {
1673             let data = self.find_handle(handle, inode)?;
1674 
1675             let mut f = data.file.lock();
1676             w.write_from(&mut f, size as usize, offset)
1677         }
1678     }
1679 
write<R: io::Read + ZeroCopyReader>( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option<u64>, _delayed_write: bool, flags: u32, ) -> io::Result<usize>1680     fn write<R: io::Read + ZeroCopyReader>(
1681         &self,
1682         _ctx: Context,
1683         inode: Inode,
1684         handle: Handle,
1685         mut r: R,
1686         size: u32,
1687         offset: u64,
1688         _lock_owner: Option<u64>,
1689         _delayed_write: bool,
1690         flags: u32,
1691     ) -> io::Result<usize> {
1692         // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
1693         // automatically clear the setuid and setgid bits for us.
1694         let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
1695             Some(drop_cap_fsetid()?)
1696         } else {
1697             None
1698         };
1699 
1700         if self.zero_message_open.load(Ordering::Relaxed) {
1701             let data = self.find_inode(inode)?;
1702 
1703             let mut file = data.file.lock();
1704             let mut flags = file.1;
1705             match flags & libc::O_ACCMODE {
1706                 libc::O_RDONLY => {
1707                     flags &= !libc::O_RDONLY;
1708                     flags |= libc::O_RDWR;
1709 
1710                     // We need to get a writable handle for this file.
1711                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1712                     *file = (newfile, flags);
1713                 }
1714                 libc::O_WRONLY | libc::O_RDWR => {}
1715                 _ => panic!("Unexpected flags: {:#x}", flags),
1716             }
1717 
1718             r.read_to(&mut file.0, size as usize, offset)
1719         } else {
1720             let data = self.find_handle(handle, inode)?;
1721 
1722             let mut f = data.file.lock();
1723             r.read_to(&mut f, size as usize, offset)
1724         }
1725     }
1726 
getattr( &self, _ctx: Context, inode: Inode, _handle: Option<Handle>, ) -> io::Result<(libc::stat64, Duration)>1727     fn getattr(
1728         &self,
1729         _ctx: Context,
1730         inode: Inode,
1731         _handle: Option<Handle>,
1732     ) -> io::Result<(libc::stat64, Duration)> {
1733         let data = self.find_inode(inode)?;
1734         self.do_getattr(&data)
1735     }
1736 
setattr( &self, _ctx: Context, inode: Inode, attr: libc::stat64, handle: Option<Handle>, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)>1737     fn setattr(
1738         &self,
1739         _ctx: Context,
1740         inode: Inode,
1741         attr: libc::stat64,
1742         handle: Option<Handle>,
1743         valid: SetattrValid,
1744     ) -> io::Result<(libc::stat64, Duration)> {
1745         let inode_data = self.find_inode(inode)?;
1746 
1747         enum Data {
1748             Handle(Arc<HandleData>, RawDescriptor),
1749             ProcPath(CString),
1750         }
1751 
1752         // If we have a handle then use it otherwise get a new fd from the inode.
1753         let data = if let Some(handle) = handle.filter(|&h| h != 0) {
1754             let hd = self.find_handle(handle, inode)?;
1755 
1756             let fd = hd.file.lock().as_raw_descriptor();
1757             Data::Handle(hd, fd)
1758         } else {
1759             let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
1760                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1761             Data::ProcPath(pathname)
1762         };
1763 
1764         if valid.contains(SetattrValid::MODE) {
1765             // Safe because this doesn't modify any memory and we check the return value.
1766             syscall!(unsafe {
1767                 match data {
1768                     Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
1769                     Data::ProcPath(ref p) => {
1770                         libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
1771                     }
1772                 }
1773             })?;
1774         }
1775 
1776         if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
1777             let uid = if valid.contains(SetattrValid::UID) {
1778                 attr.st_uid
1779             } else {
1780                 // Cannot use -1 here because these are unsigned values.
1781                 ::std::u32::MAX
1782             };
1783             let gid = if valid.contains(SetattrValid::GID) {
1784                 attr.st_gid
1785             } else {
1786                 // Cannot use -1 here because these are unsigned values.
1787                 ::std::u32::MAX
1788             };
1789 
1790             // Safe because this is a constant value and a valid C string.
1791             let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1792 
1793             // Safe because this doesn't modify any memory and we check the return value.
1794             syscall!(unsafe {
1795                 libc::fchownat(
1796                     inode_data.as_raw_descriptor(),
1797                     empty.as_ptr(),
1798                     uid,
1799                     gid,
1800                     libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
1801                 )
1802             })?;
1803         }
1804 
1805         if valid.contains(SetattrValid::SIZE) {
1806             // Safe because this doesn't modify any memory and we check the return value.
1807             syscall!(match data {
1808                 Data::Handle(_, fd) => unsafe { libc::ftruncate64(fd, attr.st_size) },
1809                 _ => {
1810                     // There is no `ftruncateat` so we need to get a new fd and truncate it.
1811                     let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
1812                     unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
1813                 }
1814             })?;
1815         }
1816 
1817         if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
1818             let mut tvs = [
1819                 libc::timespec {
1820                     tv_sec: 0,
1821                     tv_nsec: libc::UTIME_OMIT,
1822                 },
1823                 libc::timespec {
1824                     tv_sec: 0,
1825                     tv_nsec: libc::UTIME_OMIT,
1826                 },
1827             ];
1828 
1829             if valid.contains(SetattrValid::ATIME_NOW) {
1830                 tvs[0].tv_nsec = libc::UTIME_NOW;
1831             } else if valid.contains(SetattrValid::ATIME) {
1832                 tvs[0].tv_sec = attr.st_atime;
1833                 tvs[0].tv_nsec = attr.st_atime_nsec;
1834             }
1835 
1836             if valid.contains(SetattrValid::MTIME_NOW) {
1837                 tvs[1].tv_nsec = libc::UTIME_NOW;
1838             } else if valid.contains(SetattrValid::MTIME) {
1839                 tvs[1].tv_sec = attr.st_mtime;
1840                 tvs[1].tv_nsec = attr.st_mtime_nsec;
1841             }
1842 
1843             // Safe because this doesn't modify any memory and we check the return value.
1844             syscall!(unsafe {
1845                 match data {
1846                     Data::Handle(_, fd) => libc::futimens(fd, tvs.as_ptr()),
1847                     Data::ProcPath(ref p) => {
1848                         libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
1849                     }
1850                 }
1851             })?;
1852         }
1853 
1854         self.do_getattr(&inode_data)
1855     }
1856 
rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()>1857     fn rename(
1858         &self,
1859         _ctx: Context,
1860         olddir: Inode,
1861         oldname: &CStr,
1862         newdir: Inode,
1863         newname: &CStr,
1864         flags: u32,
1865     ) -> io::Result<()> {
1866         let old_inode = self.find_inode(olddir)?;
1867         let new_inode = self.find_inode(newdir)?;
1868 
1869         // Safe because this doesn't modify any memory and we check the return value.
1870         // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
1871         // and we have glibc 2.28.
1872         syscall!(unsafe {
1873             libc::syscall(
1874                 libc::SYS_renameat2,
1875                 old_inode.as_raw_descriptor(),
1876                 oldname.as_ptr(),
1877                 new_inode.as_raw_descriptor(),
1878                 newname.as_ptr(),
1879                 flags,
1880             )
1881         })?;
1882         Ok(())
1883     }
1884 
mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, ) -> io::Result<Entry>1885     fn mknod(
1886         &self,
1887         ctx: Context,
1888         parent: Inode,
1889         name: &CStr,
1890         mode: u32,
1891         rdev: u32,
1892         umask: u32,
1893     ) -> io::Result<Entry> {
1894         let data = self.find_inode(parent)?;
1895 
1896         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1897 
1898         {
1899             let _scoped_umask = ScopedUmask::new(umask);
1900 
1901             // Safe because this doesn't modify any memory and we check the return value.
1902             syscall!(unsafe {
1903                 libc::mknodat(
1904                     data.as_raw_descriptor(),
1905                     name.as_ptr(),
1906                     mode as libc::mode_t,
1907                     rdev as libc::dev_t,
1908                 )
1909             })?;
1910         }
1911 
1912         self.do_lookup(&data, name)
1913     }
1914 
link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result<Entry>1915     fn link(
1916         &self,
1917         _ctx: Context,
1918         inode: Inode,
1919         newparent: Inode,
1920         newname: &CStr,
1921     ) -> io::Result<Entry> {
1922         let data = self.find_inode(inode)?;
1923         let new_inode = self.find_inode(newparent)?;
1924 
1925         let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1926             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1927 
1928         // Safe because this doesn't modify any memory and we check the return value.
1929         syscall!(unsafe {
1930             libc::linkat(
1931                 self.proc.as_raw_descriptor(),
1932                 path.as_ptr(),
1933                 new_inode.as_raw_descriptor(),
1934                 newname.as_ptr(),
1935                 libc::AT_SYMLINK_FOLLOW,
1936             )
1937         })?;
1938 
1939         self.do_lookup(&new_inode, newname)
1940     }
1941 
symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, ) -> io::Result<Entry>1942     fn symlink(
1943         &self,
1944         ctx: Context,
1945         linkname: &CStr,
1946         parent: Inode,
1947         name: &CStr,
1948     ) -> io::Result<Entry> {
1949         let data = self.find_inode(parent)?;
1950 
1951         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1952 
1953         // Safe because this doesn't modify any memory and we check the return value.
1954         syscall!(unsafe {
1955             libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
1956         })?;
1957 
1958         self.do_lookup(&data, name)
1959     }
1960 
readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>>1961     fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
1962         let data = self.find_inode(inode)?;
1963 
1964         let mut buf = vec![0; libc::PATH_MAX as usize];
1965 
1966         // Safe because this is a constant value and a valid C string.
1967         let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1968 
1969         // Safe because this will only modify the contents of `buf` and we check the return value.
1970         let res = syscall!(unsafe {
1971             libc::readlinkat(
1972                 data.as_raw_descriptor(),
1973                 empty.as_ptr(),
1974                 buf.as_mut_ptr() as *mut libc::c_char,
1975                 buf.len(),
1976             )
1977         })?;
1978 
1979         buf.resize(res as usize, 0);
1980         Ok(buf)
1981     }
1982 
flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()>1983     fn flush(
1984         &self,
1985         _ctx: Context,
1986         inode: Inode,
1987         handle: Handle,
1988         _lock_owner: u64,
1989     ) -> io::Result<()> {
1990         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1991             self.find_inode(inode)?
1992         } else {
1993             self.find_handle(handle, inode)?
1994         };
1995 
1996         // Since this method is called whenever an fd is closed in the client, we can emulate that
1997         // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
1998         // because this doesn't modify any memory and we check the return values.
1999         unsafe {
2000             let newfd = syscall!(libc::fcntl(
2001                 data.as_raw_descriptor(),
2002                 libc::F_DUPFD_CLOEXEC,
2003                 0
2004             ))?;
2005 
2006             syscall!(libc::close(newfd))?;
2007         }
2008         Ok(())
2009     }
2010 
fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()>2011     fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
2012         if self.zero_message_open.load(Ordering::Relaxed) {
2013             let data = self.find_inode(inode)?;
2014             self.do_fsync(&*data, datasync)
2015         } else {
2016             let data = self.find_handle(handle, inode)?;
2017 
2018             let file = data.file.lock();
2019             self.do_fsync(&*file, datasync)
2020         }
2021     }
2022 
fsyncdir( &self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()>2023     fn fsyncdir(
2024         &self,
2025         _ctx: Context,
2026         inode: Inode,
2027         datasync: bool,
2028         handle: Handle,
2029     ) -> io::Result<()> {
2030         if self.zero_message_opendir.load(Ordering::Relaxed) {
2031             let data = self.find_inode(inode)?;
2032             self.do_fsync(&*data, datasync)
2033         } else {
2034             let data = self.find_handle(handle, inode)?;
2035 
2036             let file = data.file.lock();
2037             self.do_fsync(&*file, datasync)
2038         }
2039     }
2040 
access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()>2041     fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
2042         let data = self.find_inode(inode)?;
2043 
2044         let st = stat(&*data)?;
2045         let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
2046 
2047         if mode == libc::F_OK {
2048             // The file exists since we were able to call `stat(2)` on it.
2049             return Ok(());
2050         }
2051 
2052         if (mode & libc::R_OK) != 0 {
2053             if ctx.uid != 0
2054                 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
2055                 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
2056                 && st.st_mode & 0o004 == 0
2057             {
2058                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2059             }
2060         }
2061 
2062         if (mode & libc::W_OK) != 0 {
2063             if ctx.uid != 0
2064                 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
2065                 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
2066                 && st.st_mode & 0o002 == 0
2067             {
2068                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2069             }
2070         }
2071 
2072         // root can only execute something if it is executable by one of the owner, the group, or
2073         // everyone.
2074         if (mode & libc::X_OK) != 0 {
2075             if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
2076                 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
2077                 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
2078                 && st.st_mode & 0o001 == 0
2079             {
2080                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2081             }
2082         }
2083 
2084         Ok(())
2085     }
2086 
setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()>2087     fn setxattr(
2088         &self,
2089         _ctx: Context,
2090         inode: Inode,
2091         name: &CStr,
2092         value: &[u8],
2093         flags: u32,
2094     ) -> io::Result<()> {
2095         // We can't allow the VM to set this xattr because an unprivileged process may use it to set
2096         // a privileged xattr.
2097         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2098             return Err(io::Error::from_raw_os_error(libc::EPERM));
2099         }
2100 
2101         let data = self.find_inode(inode)?;
2102         let name = self.rewrite_xattr_name(name);
2103 
2104         if data.filetype == FileType::Other {
2105             // For non-regular files and directories, we cannot open the fd normally. Instead we
2106             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2107             // and then setting the CWD back to the root directory.
2108             let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2109                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2110 
2111             // Safe because this doesn't modify any memory and we check the return value.
2112             syscall!(self.with_proc_chdir(|| {
2113                 unsafe {
2114                     libc::setxattr(
2115                         path.as_ptr(),
2116                         name.as_ptr(),
2117                         value.as_ptr() as *const libc::c_void,
2118                         value.len() as libc::size_t,
2119                         flags as c_int,
2120                     )
2121                 }
2122             }))?;
2123         } else {
2124             // For regular files and directories, we can just use fsetxattr. Safe because this
2125             // doesn't modify any memory and we check the return value.
2126             syscall!(unsafe {
2127                 libc::fsetxattr(
2128                     data.as_raw_descriptor(),
2129                     name.as_ptr(),
2130                     value.as_ptr() as *const libc::c_void,
2131                     value.len() as libc::size_t,
2132                     flags as c_int,
2133                 )
2134             })?;
2135         }
2136 
2137         Ok(())
2138     }
2139 
getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result<GetxattrReply>2140     fn getxattr(
2141         &self,
2142         _ctx: Context,
2143         inode: Inode,
2144         name: &CStr,
2145         size: u32,
2146     ) -> io::Result<GetxattrReply> {
2147         // We don't allow the VM to set this xattr so we also pretend there is no value associated
2148         // with it.
2149         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2150             return Err(io::Error::from_raw_os_error(libc::ENODATA));
2151         }
2152 
2153         let data = self.find_inode(inode)?;
2154         let name = self.rewrite_xattr_name(name);
2155         let mut buf = vec![0u8; size as usize];
2156 
2157         // Safe because this will only modify the contents of `buf`.
2158         let res = self.do_getxattr(&data, &name, &mut buf[..])?;
2159         if size == 0 {
2160             Ok(GetxattrReply::Count(res as u32))
2161         } else {
2162             buf.truncate(res as usize);
2163             Ok(GetxattrReply::Value(buf))
2164         }
2165     }
2166 
listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply>2167     fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
2168         let data = self.find_inode(inode)?;
2169 
2170         let mut buf = vec![0u8; size as usize];
2171 
2172         let res = if data.filetype == FileType::Other {
2173             // For non-regular files and directories, we cannot open the fd normally. Instead we
2174             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2175             // and then setting the CWD back to the root directory.
2176             let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2177                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2178 
2179             // Safe because this will only modify `buf` and we check the return value.
2180             syscall!(self.with_proc_chdir(|| unsafe {
2181                 libc::listxattr(
2182                     path.as_ptr(),
2183                     buf.as_mut_ptr() as *mut libc::c_char,
2184                     buf.len() as libc::size_t,
2185                 )
2186             }))?
2187         } else {
2188             // For regular files and directories, we can just flistxattr. Safe because this will only
2189             // write to `buf` and we check the return value.
2190             syscall!(unsafe {
2191                 libc::flistxattr(
2192                     data.as_raw_descriptor(),
2193                     buf.as_mut_ptr() as *mut libc::c_char,
2194                     buf.len() as libc::size_t,
2195                 )
2196             })?
2197         };
2198 
2199         if size == 0 {
2200             Ok(ListxattrReply::Count(res as u32))
2201         } else {
2202             buf.truncate(res as usize);
2203 
2204             if self.cfg.rewrite_security_xattrs {
2205                 strip_xattr_prefix(&mut buf);
2206             }
2207             Ok(ListxattrReply::Names(buf))
2208         }
2209     }
2210 
removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()>2211     fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
2212         // We don't allow the VM to set this xattr so we also pretend there is no value associated
2213         // with it.
2214         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2215             return Err(io::Error::from_raw_os_error(libc::ENODATA));
2216         }
2217 
2218         let data = self.find_inode(inode)?;
2219         let name = self.rewrite_xattr_name(name);
2220 
2221         if data.filetype == FileType::Other {
2222             // For non-regular files and directories, we cannot open the fd normally. Instead we
2223             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2224             // and then setting the CWD back to the root directory.
2225             let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2226                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2227 
2228             // Safe because this doesn't modify any memory and we check the return value.
2229             syscall!(
2230                 self.with_proc_chdir(|| unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) })
2231             )?;
2232         } else {
2233             // For regular files and directories, we can just use fremovexattr. Safe because this
2234             // doesn't modify any memory and we check the return value.
2235             syscall!(unsafe { libc::fremovexattr(data.as_raw_descriptor(), name.as_ptr()) })?;
2236         }
2237 
2238         Ok(())
2239     }
2240 
fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()>2241     fn fallocate(
2242         &self,
2243         _ctx: Context,
2244         inode: Inode,
2245         handle: Handle,
2246         mode: u32,
2247         offset: u64,
2248         length: u64,
2249     ) -> io::Result<()> {
2250         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2251             let data = self.find_inode(inode)?;
2252 
2253             {
2254                 // fallocate needs a writable fd
2255                 let mut file = data.file.lock();
2256                 let mut flags = file.1;
2257                 match flags & libc::O_ACCMODE {
2258                     libc::O_RDONLY => {
2259                         flags &= !libc::O_RDONLY;
2260                         flags |= libc::O_RDWR;
2261 
2262                         // We need to get a writable handle for this file.
2263                         let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2264                         *file = (newfile, flags);
2265                     }
2266                     libc::O_WRONLY | libc::O_RDWR => {}
2267                     _ => panic!("Unexpected flags: {:#x}", flags),
2268                 }
2269             }
2270 
2271             data
2272         } else {
2273             self.find_handle(handle, inode)?
2274         };
2275 
2276         let fd = data.as_raw_descriptor();
2277         // Safe because this doesn't modify any memory and we check the return value.
2278         syscall!(unsafe {
2279             libc::fallocate64(
2280                 fd,
2281                 mode as libc::c_int,
2282                 offset as libc::off64_t,
2283                 length as libc::off64_t,
2284             )
2285         })?;
2286 
2287         Ok(())
2288     }
2289 
ioctl<R: io::Read>( &self, ctx: Context, inode: Inode, handle: Handle, _flags: IoctlFlags, cmd: u32, _arg: u64, in_size: u32, out_size: u32, r: R, ) -> io::Result<IoctlReply>2290     fn ioctl<R: io::Read>(
2291         &self,
2292         ctx: Context,
2293         inode: Inode,
2294         handle: Handle,
2295         _flags: IoctlFlags,
2296         cmd: u32,
2297         _arg: u64,
2298         in_size: u32,
2299         out_size: u32,
2300         r: R,
2301     ) -> io::Result<IoctlReply> {
2302         const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
2303         const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
2304         const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
2305         const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
2306         const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
2307         const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
2308         const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
2309         const ENABLE_VERITY: u32 = FS_IOC_ENABLE_VERITY() as u32;
2310         const MEASURE_VERITY: u32 = FS_IOC_MEASURE_VERITY() as u32;
2311 
2312         match cmd {
2313             GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
2314             GET_FSXATTR => {
2315                 if out_size < size_of::<fsxattr>() as u32 {
2316                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2317                 } else {
2318                     self.get_fsxattr(inode, handle)
2319                 }
2320             }
2321             SET_FSXATTR => {
2322                 if in_size < size_of::<fsxattr>() as u32 {
2323                     Err(io::Error::from_raw_os_error(libc::EINVAL))
2324                 } else {
2325                     self.set_fsxattr(ctx, inode, handle, r)
2326                 }
2327             }
2328             GET_FLAGS32 | GET_FLAGS64 => {
2329                 if out_size < size_of::<c_int>() as u32 {
2330                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2331                 } else {
2332                     self.get_flags(inode, handle)
2333                 }
2334             }
2335             SET_FLAGS32 | SET_FLAGS64 => {
2336                 if in_size < size_of::<c_int>() as u32 {
2337                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2338                 } else {
2339                     self.set_flags(inode, handle, r)
2340                 }
2341             }
2342             ENABLE_VERITY => {
2343                 if in_size < size_of::<fsverity_enable_arg>() as u32 {
2344                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2345                 } else {
2346                     self.enable_verity(inode, handle, r)
2347                 }
2348             }
2349             MEASURE_VERITY => {
2350                 if in_size < size_of::<fsverity_digest>() as u32
2351                     || out_size < size_of::<fsverity_digest>() as u32
2352                 {
2353                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2354                 } else {
2355                     self.measure_verity(inode, handle, r, out_size)
2356                 }
2357             }
2358             _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
2359         }
2360     }
2361 
copy_file_range( &self, ctx: Context, inode_src: Inode, handle_src: Handle, offset_src: u64, inode_dst: Inode, handle_dst: Handle, offset_dst: u64, length: u64, flags: u64, ) -> io::Result<usize>2362     fn copy_file_range(
2363         &self,
2364         ctx: Context,
2365         inode_src: Inode,
2366         handle_src: Handle,
2367         offset_src: u64,
2368         inode_dst: Inode,
2369         handle_dst: Handle,
2370         offset_dst: u64,
2371         length: u64,
2372         flags: u64,
2373     ) -> io::Result<usize> {
2374         // We need to change credentials during a write so that the kernel will remove setuid or
2375         // setgid bits from the file if it was written to by someone other than the owner.
2376         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2377         let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
2378             if self.zero_message_open.load(Ordering::Relaxed) {
2379                 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
2380             } else {
2381                 (
2382                     self.find_handle(handle_src, inode_src)?,
2383                     self.find_handle(handle_dst, inode_dst)?,
2384                 )
2385             };
2386 
2387         let src = src_data.as_raw_descriptor();
2388         let dst = dst_data.as_raw_descriptor();
2389 
2390         Ok(syscall!(unsafe {
2391             libc::syscall(
2392                 libc::SYS_copy_file_range,
2393                 src,
2394                 &offset_src,
2395                 dst,
2396                 &offset_dst,
2397                 length,
2398                 flags,
2399             )
2400         })? as usize)
2401     }
2402 
set_up_mapping<M: Mapper>( &self, _ctx: Context, inode: Self::Inode, _handle: Self::Handle, file_offset: u64, mem_offset: u64, size: usize, prot: u32, mapper: M, ) -> io::Result<()>2403     fn set_up_mapping<M: Mapper>(
2404         &self,
2405         _ctx: Context,
2406         inode: Self::Inode,
2407         _handle: Self::Handle,
2408         file_offset: u64,
2409         mem_offset: u64,
2410         size: usize,
2411         prot: u32,
2412         mapper: M,
2413     ) -> io::Result<()> {
2414         if !self.cfg.use_dax {
2415             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
2416         }
2417 
2418         let read = prot & libc::PROT_READ as u32 != 0;
2419         let write = prot & libc::PROT_WRITE as u32 != 0;
2420         let mmap_flags = match (read, write) {
2421             (true, true) => libc::O_RDWR,
2422             (true, false) => libc::O_RDONLY,
2423             (false, true) => libc::O_RDWR, // mmap always requires an fd opened for reading.
2424             (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
2425         };
2426 
2427         let data = self.find_inode(inode)?;
2428 
2429         if self.zero_message_open.load(Ordering::Relaxed) {
2430             let mut file = data.file.lock();
2431             let mut open_flags = file.1;
2432             match (mmap_flags, open_flags & libc::O_ACCMODE) {
2433                 (libc::O_RDONLY, libc::O_WRONLY)
2434                 | (libc::O_RDWR, libc::O_RDONLY)
2435                 | (libc::O_RDWR, libc::O_WRONLY) => {
2436                     // We have a read-only or write-only fd and we need to upgrade it.
2437                     open_flags &= !libc::O_ACCMODE;
2438                     open_flags |= libc::O_RDWR;
2439 
2440                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2441                     *file = (newfile, open_flags);
2442                 }
2443                 (libc::O_RDONLY, libc::O_RDONLY)
2444                 | (libc::O_RDONLY, libc::O_RDWR)
2445                 | (libc::O_RDWR, libc::O_RDWR) => {}
2446                 (m, o) => panic!(
2447                     "Unexpected combination of access flags: ({:#x}, {:#x})",
2448                     m, o
2449                 ),
2450             }
2451             mapper.map(mem_offset, size, &file.0, file_offset, prot)
2452         } else {
2453             let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
2454             mapper.map(mem_offset, size, &file, file_offset, prot)
2455         }
2456     }
2457 
remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()>2458     fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
2459         if !self.cfg.use_dax {
2460             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
2461         }
2462 
2463         for RemoveMappingOne { moffset, len } in msgs {
2464             mapper.unmap(*moffset, *len)?;
2465         }
2466         Ok(())
2467     }
2468 }
2469 
#[cfg(test)]
mod tests {
    use super::*;

    // Turns a NUL-terminated byte literal into a `CStr`, failing the test if it is malformed.
    fn cstr(bytes: &[u8]) -> &CStr {
        CStr::from_bytes_with_nul(bytes).expect("test literal must be NUL-terminated")
    }

    // Runs `strip_xattr_prefix` on a copy of `input` and checks it against `expected`.
    #[track_caller]
    fn assert_stripped(input: &[u8], expected: &[u8]) {
        let mut actual = input.to_vec();
        strip_xattr_prefix(&mut actual);
        assert_eq!(&actual[..], expected);
    }

    #[test]
    fn rewrite_xattr_names() {
        let p = PassthroughFs::new(Config {
            rewrite_security_xattrs: true,
            ..Default::default()
        })
        .expect("Failed to create PassthroughFs");

        // Selinux shouldn't get overwritten, and user, trusted, and system names should not be
        // changed either.
        let untouched: [&[u8]; 4] = [
            b"security.selinux\0",
            b"user.foobar\0",
            b"trusted.foobar\0",
            b"system.foobar\0",
        ];
        for raw in untouched.iter() {
            let name = cstr(raw);
            assert_eq!(p.rewrite_xattr_name(name).to_bytes(), name.to_bytes());
        }

        // sehash should be re-written.
        let sehash = cstr(b"security.sehash\0");
        assert_eq!(
            p.rewrite_xattr_name(sehash).to_bytes(),
            b"user.virtiofs.security.sehash"
        );
    }

    #[test]
    fn strip_xattr_names() {
        // Inputs that contain nothing to strip must come back untouched.
        let only_nuls = b"\0\0\0\0\0";
        assert_stripped(only_nuls, only_nuls);

        let no_nuls = b"security.sehashuser.virtiofs";
        assert_stripped(no_nuls, no_nuls);

        let empty = b"";
        assert_stripped(empty, empty);

        let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
        assert_stripped(no_strippable_names, no_strippable_names);

        // Every name carries the prefix.
        assert_stripped(
            b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0",
            b"security.sehash\0security.wat\0",
        );

        // Prefixed and unprefixed names interleaved.
        assert_stripped(
            b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0",
            b"security.sehash\0security.selinux\0security.wat\0user.foobar\0",
        );

        // A prefixed name without a trailing NUL is still stripped.
        assert_stripped(b"user.virtiofs.security.sehash", b"security.sehash");
    }
}
2542