• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::borrow::Cow;
6 use std::cmp;
7 use std::collections::btree_map;
8 use std::collections::BTreeMap;
9 use std::ffi::{CStr, CString};
10 use std::fs::File;
11 use std::io;
12 use std::mem::{self, size_of, MaybeUninit};
13 use std::os::raw::{c_int, c_long};
14 use std::str::FromStr;
15 use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
16 use std::sync::Arc;
17 use std::time::Duration;
18 
19 use base::{
20     error, ioctl_ior_nr, ioctl_iow_nr, ioctl_iowr_nr, ioctl_with_mut_ptr, ioctl_with_ptr,
21     AsRawDescriptor, FromRawDescriptor, RawDescriptor,
22 };
23 use data_model::DataInit;
24 use fuse::filesystem::{
25     Context, DirectoryIterator, Entry, FileSystem, FsOptions, GetxattrReply, IoctlFlags,
26     IoctlReply, ListxattrReply, OpenOptions, RemoveMappingOne, SetattrValid, ZeroCopyReader,
27     ZeroCopyWriter, ROOT_ID,
28 };
29 use fuse::sys::WRITE_KILL_PRIV;
30 use fuse::Mapper;
31 use sync::Mutex;
32 
33 use crate::virtio::fs::caps::{Capability, Caps, Set as CapSet, Value as CapValue};
34 use crate::virtio::fs::multikey::MultikeyBTreeMap;
35 use crate::virtio::fs::read_dir::ReadDir;
36 
// Sizes (in bytes) of the key fields embedded in the fscrypt policy structs declared below.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// Nul-terminated byte strings that can be wrapped in a `CStr` and handed to libc without
// allocating.
const EMPTY_CSTR: &[u8] = b"\0";
const ROOT_CSTR: &[u8] = b"/\0";
const PROC_CSTR: &[u8] = b"/proc\0";

// Extended attribute names / prefixes used when re-writing `security.*` xattrs. These do not
// carry a nul terminator.
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
47 
48 #[repr(C)]
49 #[derive(Clone, Copy)]
50 struct fscrypt_policy_v1 {
51     _version: u8,
52     _contents_encryption_mode: u8,
53     _filenames_encryption_mode: u8,
54     _flags: u8,
55     _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
56 }
57 unsafe impl DataInit for fscrypt_policy_v1 {}
58 
59 #[repr(C)]
60 #[derive(Clone, Copy)]
61 struct fscrypt_policy_v2 {
62     _version: u8,
63     _contents_encryption_mode: u8,
64     _filenames_encryption_mode: u8,
65     _flags: u8,
66     __reserved: [u8; 4],
67     master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
68 }
69 unsafe impl DataInit for fscrypt_policy_v2 {}
70 
71 #[repr(C)]
72 #[derive(Copy, Clone)]
73 union fscrypt_policy {
74     _version: u8,
75     _v1: fscrypt_policy_v1,
76     _v2: fscrypt_policy_v2,
77 }
78 unsafe impl DataInit for fscrypt_policy {}
79 
80 #[repr(C)]
81 #[derive(Copy, Clone)]
82 struct fscrypt_get_policy_ex_arg {
83     policy_size: u64,       /* input/output */
84     policy: fscrypt_policy, /* output */
85 }
86 unsafe impl DataInit for fscrypt_get_policy_ex_arg {}
87 
88 ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
89 
90 #[repr(C)]
91 #[derive(Clone, Copy)]
92 struct fsxattr {
93     _fsx_xflags: u32,     /* xflags field value (get/set) */
94     _fsx_extsize: u32,    /* extsize field value (get/set)*/
95     _fsx_nextents: u32,   /* nextents field value (get)	*/
96     _fsx_projid: u32,     /* project identifier (get/set) */
97     _fsx_cowextsize: u32, /* CoW extsize field value (get/set)*/
98     _fsx_pad: [u8; 8],
99 }
100 unsafe impl DataInit for fsxattr {}
101 
102 ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
103 ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);
104 
105 ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
106 ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);
107 
108 ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
109 ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);
110 
111 ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
112 ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
113 
/// Key under which an open file or directory is stored in `PassthroughFs::handles`.
type Handle = u64;
/// Key under which a file system object is stored in `PassthroughFs::inodes`.
type Inode = u64;
116 
117 #[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
118 struct InodeAltKey {
119     ino: libc::ino64_t,
120     dev: libc::dev_t,
121 }
122 
/// Coarse classification of a file: regular files and directories are told apart, everything
/// else is lumped into `Other`.
#[derive(Eq, PartialEq)]
enum FileType {
    Regular,
    Directory,
    Other,
}
129 
130 impl From<libc::mode_t> for FileType {
from(mode: libc::mode_t) -> Self131     fn from(mode: libc::mode_t) -> Self {
132         match mode & libc::S_IFMT {
133             libc::S_IFREG => FileType::Regular,
134             libc::S_IFDIR => FileType::Directory,
135             _ => FileType::Other,
136         }
137     }
138 }
139 
140 struct InodeData {
141     inode: Inode,
142     // (File, open_flags)
143     file: Mutex<(File, libc::c_int)>,
144     refcount: AtomicU64,
145     filetype: FileType,
146 }
147 
148 impl AsRawDescriptor for InodeData {
as_raw_descriptor(&self) -> RawDescriptor149     fn as_raw_descriptor(&self) -> RawDescriptor {
150         self.file.lock().0.as_raw_descriptor()
151     }
152 }
153 
154 struct HandleData {
155     inode: Inode,
156     file: Mutex<File>,
157 }
158 
159 impl AsRawDescriptor for HandleData {
as_raw_descriptor(&self) -> RawDescriptor160     fn as_raw_descriptor(&self) -> RawDescriptor {
161         self.file.lock().as_raw_descriptor()
162     }
163 }
164 
// Generates a guard type `$name` that switches one credential of type `$ty` (a uid or gid) for
// the calling thread using the raw syscall `$syscall_nr`, and switches it back on drop.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Returns `Ok(None)`
            // without issuing a syscall when `val` already equals `old`; otherwise returns a
            // guard that changes the thread's credentials back to `old` when it is dropped.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                if val != old {
                    // Credential changes must stay per-thread, otherwise they would interfere
                    // with operations that other threads carry out under different uids/gids.
                    // POSIX however requires all threads of a process to share credentials, and
                    // libc implements that by signalling the other threads whenever one of them
                    // changes its credentials.
                    //
                    // Invoking the syscall directly side-steps that behavior. The setfsuid and
                    // setfsgid system calls would be an alternative, but they have no way to
                    // report an error, so this is preferable.

                    // This call is safe because it doesn't modify any memory and we check the
                    // return value.
                    let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
                    return match res {
                        0 => Ok(Some($name { old })),
                        _ => Err(io::Error::last_os_error()),
                    };
                }

                // Nothing to do since we already have the correct value.
                Ok(None)
            }
        }

        impl Drop for $name {
            fn drop(&mut self) {
                // Same syscall as in `new`: it doesn't modify any memory and the return value is
                // checked below.
                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if res >= 0 {
                    return;
                }

                // There is no way to report a failure from `drop`, so only log it.
                error!(
                    "failed to change credentials back to {}: {}",
                    self.old,
                    io::Error::last_os_error(),
                );
            }
        }
    };
}
218 #[cfg(not(target_arch = "arm"))]
219 scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
220 #[cfg(target_arch = "arm")]
221 scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);
222 
223 #[cfg(not(target_arch = "arm"))]
224 scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
225 #[cfg(target_arch = "arm")]
226 scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);
227 
228 #[cfg(not(target_arch = "arm"))]
229 const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
230 #[cfg(target_arch = "arm")]
231 const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;
232 
233 #[cfg(not(target_arch = "arm"))]
234 const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
235 #[cfg(target_arch = "arm")]
236 const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
237 
238 thread_local! {
239     // Both these calls are safe because they take no parameters, and only return an integer value.
240     // The kernel also guarantees that they can never fail.
241     static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
242     static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
243 }
244 
set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)>245 fn set_creds(
246     uid: libc::uid_t,
247     gid: libc::gid_t,
248 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
249     let olduid = THREAD_EUID.with(|uid| *uid);
250     let oldgid = THREAD_EGID.with(|gid| *gid);
251 
252     // We have to change the gid before we change the uid because if we change the uid first then we
253     // lose the capability to change the gid.  However changing back can happen in any order.
254     ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
255 }
256 
257 struct ScopedUmask<'a> {
258     old: libc::mode_t,
259     mask: libc::mode_t,
260     _factory: &'a mut Umask,
261 }
262 
263 impl<'a> Drop for ScopedUmask<'a> {
drop(&mut self)264     fn drop(&mut self) {
265         // Safe because this doesn't modify any memory and always succeeds.
266         let previous = unsafe { libc::umask(self.old) };
267         debug_assert_eq!(
268             previous, self.mask,
269             "umask changed while holding ScopedUmask"
270         );
271     }
272 }
273 
274 struct Umask;
275 
276 impl Umask {
set(&mut self, mask: libc::mode_t) -> ScopedUmask277     fn set(&mut self, mask: libc::mode_t) -> ScopedUmask {
278         ScopedUmask {
279             // Safe because this doesn't modify any memory and always succeeds.
280             old: unsafe { libc::umask(mask) },
281             mask,
282             _factory: self,
283         }
284     }
285 }
286 
287 struct ScopedFsetid(Caps);
288 impl Drop for ScopedFsetid {
drop(&mut self)289     fn drop(&mut self) {
290         if let Err(e) = raise_cap_fsetid(&mut self.0) {
291             error!(
292                 "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
293                 e
294             )
295         }
296     }
297 }
298 
raise_cap_fsetid(c: &mut Caps) -> io::Result<()>299 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
300     c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
301     c.apply()
302 }
303 
304 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
305 // adds the capability back when it is dropped.
drop_cap_fsetid() -> io::Result<ScopedFsetid>306 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
307     let mut caps = Caps::for_current_thread()?;
308     caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
309     caps.apply()?;
310     Ok(ScopedFsetid(caps))
311 }
312 
ebadf() -> io::Error313 fn ebadf() -> io::Error {
314     io::Error::from_raw_os_error(libc::EBADF)
315 }
316 
stat<F: AsRawDescriptor>(f: &F) -> io::Result<libc::stat64>317 fn stat<F: AsRawDescriptor>(f: &F) -> io::Result<libc::stat64> {
318     let mut st = MaybeUninit::<libc::stat64>::zeroed();
319 
320     // Safe because this is a constant value and a valid C string.
321     let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
322 
323     // Safe because the kernel will only write data in `st` and we check the return
324     // value.
325     let res = unsafe {
326         libc::fstatat64(
327             f.as_raw_descriptor(),
328             pathname.as_ptr(),
329             st.as_mut_ptr(),
330             libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
331         )
332     };
333     if res >= 0 {
334         // Safe because the kernel guarantees that the struct is now fully initialized.
335         Ok(unsafe { st.assume_init() })
336     } else {
337         Err(io::Error::last_os_error())
338     }
339 }
340 
statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64>341 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
342     let mut st = MaybeUninit::<libc::stat64>::zeroed();
343 
344     // Safe because the kernel will only write data in `st` and we check the return
345     // value.
346     let res = unsafe {
347         libc::fstatat64(
348             dir.as_raw_descriptor(),
349             name.as_ptr(),
350             st.as_mut_ptr(),
351             libc::AT_SYMLINK_NOFOLLOW,
352         )
353     };
354     if res >= 0 {
355         // Safe because the kernel guarantees that the struct is now fully initialized.
356         Ok(unsafe { st.assume_init() })
357     } else {
358         Err(io::Error::last_os_error())
359     }
360 }
361 
/// The caching policy that the file system reports to the FUSE client. The FUSE protocol
/// defaults to close-to-open consistency: cached contents of a file are invalidated the next time
/// that file is opened.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CachePolicy {
    /// The client must never cache file data; all I/O is forwarded directly to the server. This
    /// policy must be selected when file contents may change without the knowledge of the FUSE
    /// client (i.e., the file system does not have exclusive access to the directory).
    Never,

    /// The client decides when and how to cache file data. This is the default policy and uses
    /// close-to-open consistency as described in the enum documentation.
    Auto,

    /// The client always caches file data, meaning it will not invalidate cached data that the
    /// file system returned the last time the file was opened. This policy should only be
    /// selected when the file system has exclusive access to the directory.
    Always,
}

impl FromStr for CachePolicy {
    type Err = &'static str;

    /// Parses a policy name. Each name is accepted in exactly three spellings: all lower case,
    /// capitalized and all upper case. Anything else yields `"invalid cache policy"`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if ["never", "Never", "NEVER"].contains(&s) {
            Ok(CachePolicy::Never)
        } else if ["auto", "Auto", "AUTO"].contains(&s) {
            Ok(CachePolicy::Auto)
        } else if ["always", "Always", "ALWAYS"].contains(&s) {
            Ok(CachePolicy::Always)
        } else {
            Err("invalid cache policy")
        }
    }
}

impl Default for CachePolicy {
    /// The default policy is `CachePolicy::Auto`.
    fn default() -> Self {
        CachePolicy::Auto
    }
}
401 
402 /// Options that configure the behavior of the file system.
403 #[derive(Debug, Clone)]
404 pub struct Config {
405     /// How long the FUSE client should consider directory entries to be valid. If the contents of a
406     /// directory can only be modified by the FUSE client (i.e., the file system has exclusive
407     /// access), then this should be a large value.
408     ///
409     /// The default value for this option is 5 seconds.
410     pub entry_timeout: Duration,
411 
412     /// How long the FUSE client should consider file and directory attributes to be valid. If the
413     /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file
414     /// system has exclusive access), then this should be set to a large value.
415     ///
416     /// The default value for this option is 5 seconds.
417     pub attr_timeout: Duration,
418 
419     /// The caching policy the file system should use. See the documentation of `CachePolicy` for
420     /// more details.
421     pub cache_policy: CachePolicy,
422 
423     /// Whether the file system should enabled writeback caching. This can improve performance as it
424     /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file
425     /// system. However, enabling this option can increase the risk of data corruption if the file
426     /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT**
427     /// have exclusive access). Additionally, the file system should have read access to all files
428     /// in the directory it is serving as the FUSE client may send read requests even for files
429     /// opened with `O_WRONLY`.
430     ///
431     /// Therefore callers should only enable this option when they can guarantee that: 1) the file
432     /// system has exclusive access to the directory and 2) the file system has read permissions for
433     /// all files in that directory.
434     ///
435     /// The default value for this option is `false`.
436     pub writeback: bool,
437 
438     /// Controls whether security.* xattrs (except for security.selinux) are re-written. When this
439     /// is set to true, the server will add a "user.virtiofs" prefix to xattrs in the security
440     /// namespace. Setting these xattrs requires CAP_SYS_ADMIN in the namespace where the file
441     /// system was mounted and since the server usually runs in an unprivileged user namespace, it's
442     /// unlikely to have that capability.
443     ///
444     /// The default value for this option is `false`.
445     pub rewrite_security_xattrs: bool,
446 
447     /// Use case-insensitive lookups for directory entries (ASCII only).
448     ///
449     /// The default value for this option is `false`.
450     pub ascii_casefold: bool,
451 }
452 
453 impl Default for Config {
default() -> Self454     fn default() -> Self {
455         Config {
456             entry_timeout: Duration::from_secs(5),
457             attr_timeout: Duration::from_secs(5),
458             cache_policy: Default::default(),
459             writeback: false,
460             rewrite_security_xattrs: false,
461             ascii_casefold: false,
462         }
463     }
464 }
465 
466 /// A file system that simply "passes through" all requests it receives to the underlying file
467 /// system. To keep the implementation simple it servers the contents of its root directory. Users
468 /// that wish to serve only a specific directory should set up the environment so that that
469 /// directory ends up as the root of the file system process. One way to accomplish this is via a
470 /// combination of mount namespaces and the pivot_root system call.
471 pub struct PassthroughFs {
472     // File descriptors for various points in the file system tree.
473     inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
474     next_inode: AtomicU64,
475 
476     // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
477     // used for reading and writing data.
478     handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
479     next_handle: AtomicU64,
480 
481     // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
482     // `inodes` into one that can go into `handles`. This is accomplished by reading the
483     // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
484     // to be serving doesn't have access to `/proc`.
485     proc: File,
486 
487     // Whether writeback caching is enabled for this directory. This will only be true when
488     // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
489     writeback: AtomicBool,
490 
491     // Whether zero message opens are supported by the kernel driver.
492     zero_message_open: AtomicBool,
493 
494     // Whether zero message opendir is supported by the kernel driver.
495     zero_message_opendir: AtomicBool,
496 
497     // Used to ensure that only one thread at a time uses chdir(). Since chdir() affects the
498     // process-wide CWD, we cannot allow more than one thread to do it at the same time.
499     chdir_mutex: Mutex<()>,
500 
501     // Used when creating files / directories / nodes. Since the umask is process-wide, we can only
502     // allow one thread at a time to change it.
503     umask: Mutex<Umask>,
504 
505     cfg: Config,
506 }
507 
508 impl PassthroughFs {
new(cfg: Config) -> io::Result<PassthroughFs>509     pub fn new(cfg: Config) -> io::Result<PassthroughFs> {
510         // Safe because this is a constant value and a valid C string.
511         let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
512 
513         // Safe because this doesn't modify any memory and we check the return value.
514         let raw_descriptor = unsafe {
515             libc::openat(
516                 libc::AT_FDCWD,
517                 proc_cstr.as_ptr(),
518                 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
519             )
520         };
521         if raw_descriptor < 0 {
522             return Err(io::Error::last_os_error());
523         }
524 
525         // Safe because we just opened this descriptor.
526         let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
527 
528         Ok(PassthroughFs {
529             inodes: Mutex::new(MultikeyBTreeMap::new()),
530             next_inode: AtomicU64::new(ROOT_ID + 1),
531 
532             handles: Mutex::new(BTreeMap::new()),
533             next_handle: AtomicU64::new(1),
534 
535             proc,
536 
537             writeback: AtomicBool::new(false),
538             zero_message_open: AtomicBool::new(false),
539             zero_message_opendir: AtomicBool::new(false),
540 
541             chdir_mutex: Mutex::new(()),
542             umask: Mutex::new(Umask),
543             cfg,
544         })
545     }
546 
keep_rds(&self) -> Vec<RawDescriptor>547     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
548         vec![self.proc.as_raw_descriptor()]
549     }
550 
rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr>551     fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
552         if !self.cfg.rewrite_security_xattrs {
553             return Cow::Borrowed(name);
554         }
555 
556         // Does not include nul-terminator.
557         let buf = name.to_bytes();
558         if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
559             return Cow::Borrowed(name);
560         }
561 
562         let mut newname = USER_VIRTIOFS_XATTR.to_vec();
563         newname.extend_from_slice(buf);
564 
565         // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
566         // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
567         Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
568     }
569 
find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>>570     fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
571         self.inodes
572             .lock()
573             .get(&inode)
574             .map(Arc::clone)
575             .ok_or_else(ebadf)
576     }
577 
find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>>578     fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
579         self.handles
580             .lock()
581             .get(&handle)
582             .filter(|hd| hd.inode == inode)
583             .map(Arc::clone)
584             .ok_or_else(ebadf)
585     }
586 
open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File>587     fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
588         let pathname = CString::new(format!("self/fd/{}", fd))
589             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
590 
591         // Safe because this doesn't modify any memory and we check the return value. We don't
592         // really check `flags` because if the kernel can't handle poorly specified flags then we
593         // have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
594         // to follow the `/proc/self/fd` symlink to get the file.
595         let raw_descriptor = unsafe {
596             libc::openat(
597                 self.proc.as_raw_descriptor(),
598                 pathname.as_ptr(),
599                 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
600             )
601         };
602         if raw_descriptor < 0 {
603             return Err(io::Error::last_os_error());
604         }
605 
606         // Safe because we just opened this descriptor.
607         Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
608     }
609 
open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File>610     fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
611         // When writeback caching is enabled, the kernel may send read requests even if the
612         // userspace program opened the file write-only. So we need to ensure that we have opened
613         // the file for reading as well as writing.
614         let writeback = self.writeback.load(Ordering::Relaxed);
615         if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
616             flags &= !libc::O_ACCMODE;
617             flags |= libc::O_RDWR;
618         }
619 
620         // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
621         // However, this breaks atomicity as the file may have changed on disk, invalidating the
622         // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
623         // the file. Just allow this for now as it is the user's responsibility to enable writeback
624         // caching only for directories that are not shared. It also means that we need to clear the
625         // `O_APPEND` flag.
626         if writeback && flags & libc::O_APPEND != 0 {
627             flags &= !libc::O_APPEND;
628         }
629 
630         self.open_fd(inode.as_raw_descriptor(), flags)
631     }
632 
633     // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry634     fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry {
635         let altkey = InodeAltKey {
636             ino: st.st_ino,
637             dev: st.st_dev,
638         };
639         let data = self.inodes.lock().get_alt(&altkey).map(Arc::clone);
640 
641         let inode = if let Some(data) = data {
642             // Matches with the release store in `forget`.
643             data.refcount.fetch_add(1, Ordering::Acquire);
644             data.inode
645         } else {
646             // There is a possible race here where 2 threads end up adding the same file
647             // into the inode list.  However, since each of those will get a unique Inode
648             // value and unique file descriptors this shouldn't be that much of a problem.
649             let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
650             self.inodes.lock().insert(
651                 inode,
652                 InodeAltKey {
653                     ino: st.st_ino,
654                     dev: st.st_dev,
655                 },
656                 Arc::new(InodeData {
657                     inode,
658                     file: Mutex::new((f, open_flags)),
659                     refcount: AtomicU64::new(1),
660                     filetype: st.st_mode.into(),
661                 }),
662             );
663 
664             inode
665         };
666 
667         Entry {
668             inode,
669             generation: 0,
670             attr: st,
671             attr_timeout: self.cfg.attr_timeout,
672             entry_timeout: self.cfg.entry_timeout,
673         }
674     }
675 
676     // Performs an ascii case insensitive lookup.
ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry>677     fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
678         let mut buf = [0u8; 1024];
679         let mut offset = 0;
680         loop {
681             let mut read_dir = ReadDir::new(parent, offset, &mut buf[..])?;
682             if read_dir.remaining() == 0 {
683                 break;
684             }
685 
686             while let Some(entry) = read_dir.next() {
687                 offset = entry.offset as libc::off64_t;
688                 if name.eq_ignore_ascii_case(entry.name.to_bytes()) {
689                     return self.do_lookup(parent, entry.name);
690                 }
691             }
692         }
693         Err(io::Error::from_raw_os_error(libc::ENOENT))
694     }
695 
do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry>696     fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
697         let st = statat(parent, name)?;
698 
699         let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
700         match FileType::from(st.st_mode) {
701             FileType::Regular => {}
702             FileType::Directory => flags |= libc::O_DIRECTORY,
703             FileType::Other => flags |= libc::O_PATH,
704         }
705 
706         // Safe because this doesn't modify any memory and we check the return value.
707         let fd = unsafe { libc::openat(parent.as_raw_descriptor(), name.as_ptr(), flags) };
708         if fd < 0 {
709             return Err(io::Error::last_os_error());
710         }
711 
712         // Safe because we just opened this fd.
713         let f = unsafe { File::from_raw_descriptor(fd) };
714 
715         Ok(self.add_entry(f, st, flags))
716     }
717 
do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)>718     fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
719         let inode_data = self.find_inode(inode)?;
720 
721         let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
722 
723         let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
724         let data = HandleData { inode, file };
725 
726         self.handles.lock().insert(handle, Arc::new(data));
727 
728         let mut opts = OpenOptions::empty();
729         match self.cfg.cache_policy {
730             // We only set the direct I/O option on files.
731             CachePolicy::Never => opts.set(
732                 OpenOptions::DIRECT_IO,
733                 flags & (libc::O_DIRECTORY as u32) == 0,
734             ),
735             CachePolicy::Always => {
736                 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
737                     OpenOptions::KEEP_CACHE
738                 } else {
739                     OpenOptions::CACHE_DIR
740                 }
741             }
742             _ => {}
743         };
744 
745         Ok((Some(handle), opts))
746     }
747 
do_release(&self, inode: Inode, handle: Handle) -> io::Result<()>748     fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
749         let mut handles = self.handles.lock();
750 
751         if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
752             if e.get().inode == inode {
753                 // We don't need to close the file here because that will happen automatically when
754                 // the last `Arc` is dropped.
755                 e.remove();
756                 return Ok(());
757             }
758         }
759 
760         Err(ebadf())
761     }
762 
do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)>763     fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
764         let st = stat(inode)?;
765 
766         Ok((st, self.cfg.attr_timeout))
767     }
768 
do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()>769     fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
770         // Safe because this doesn't modify any memory and we check the return value.
771         let res = unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) };
772         if res == 0 {
773             Ok(())
774         } else {
775             Err(io::Error::last_os_error())
776         }
777     }
778 
do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()>779     fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
780         // Safe because this doesn't modify any memory and we check the return value.
781         let res = unsafe {
782             if datasync {
783                 libc::fdatasync(file.as_raw_descriptor())
784             } else {
785                 libc::fsync(file.as_raw_descriptor())
786             }
787         };
788 
789         if res == 0 {
790             Ok(())
791         } else {
792             Err(io::Error::last_os_error())
793         }
794     }
795 
796     // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
797     // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
798     // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
799     // root inode.
with_proc_chdir<F, T>(&self, f: F) -> T where F: FnOnce() -> T,800     fn with_proc_chdir<F, T>(&self, f: F) -> T
801     where
802         F: FnOnce() -> T,
803     {
804         let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
805         let chdir_lock = self.chdir_mutex.lock();
806 
807         // Safe because this doesn't modify any memory and we check the return value. Since the
808         // fchdir should never fail we just use debug_asserts.
809         let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
810         debug_assert_eq!(
811             proc_cwd,
812             0,
813             "failed to fchdir to /proc: {}",
814             io::Error::last_os_error()
815         );
816 
817         let res = f();
818 
819         // Safe because this doesn't modify any memory and we check the return value. Since the
820         // fchdir should never fail we just use debug_asserts.
821         let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
822         debug_assert_eq!(
823             root_cwd,
824             0,
825             "failed to fchdir back to root directory: {}",
826             io::Error::last_os_error()
827         );
828 
829         mem::drop(chdir_lock);
830         res
831     }
832 
do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize>833     fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
834         let res = if inode.filetype == FileType::Other {
835             // For non-regular files and directories, we cannot open the fd normally. Instead we
836             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
837             // and then setting the CWD back to the root directory.
838             let path = CString::new(format!("self/fd/{}", inode.as_raw_descriptor()))
839                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
840 
841             // Safe because this will only modify `value` and we check the return value.
842             self.with_proc_chdir(|| unsafe {
843                 libc::getxattr(
844                     path.as_ptr(),
845                     name.as_ptr(),
846                     value.as_mut_ptr() as *mut libc::c_void,
847                     value.len() as libc::size_t,
848                 )
849             })
850         } else {
851             // For regular files and directories, we can just use fgetxattr. Safe because this will
852             // only write to `value` and we check the return value.
853             unsafe {
854                 libc::fgetxattr(
855                     inode.as_raw_descriptor(),
856                     name.as_ptr(),
857                     value.as_mut_ptr() as *mut libc::c_void,
858                     value.len() as libc::size_t,
859                 )
860             }
861         };
862 
863         if res < 0 {
864             Err(io::Error::last_os_error())
865         } else {
866             Ok(res as usize)
867         }
868     }
869 
get_encryption_policy_ex<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>870     fn get_encryption_policy_ex<R: io::Read>(
871         &self,
872         inode: Inode,
873         handle: Handle,
874         mut r: R,
875     ) -> io::Result<IoctlReply> {
876         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
877             self.find_inode(inode)?
878         } else {
879             self.find_handle(handle, inode)?
880         };
881 
882         // Safe because this only has integer fields.
883         let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
884         r.read_exact(arg.policy_size.as_mut_slice())?;
885 
886         let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
887         arg.policy_size = policy_size;
888 
889         // Safe because the kernel will only write to `arg` and we check the return value.
890         let res =
891             unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
892         if res < 0 {
893             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
894         } else {
895             let len = size_of::<u64>() + arg.policy_size as usize;
896             Ok(IoctlReply::Done(Ok(arg.as_slice()[..len].to_vec())))
897         }
898     }
899 
get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>900     fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
901         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
902             self.find_inode(inode)?
903         } else {
904             self.find_handle(handle, inode)?
905         };
906 
907         let mut buf = MaybeUninit::<fsxattr>::zeroed();
908 
909         // Safe because the kernel will only write to `buf` and we check the return value.
910         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
911         if res < 0 {
912             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
913         } else {
914             // Safe because the kernel guarantees that the policy is now initialized.
915             let xattr = unsafe { buf.assume_init() };
916             Ok(IoctlReply::Done(Ok(xattr.as_slice().to_vec())))
917         }
918     }
919 
set_fsxattr<R: io::Read>( &self, inode: Inode, handle: Handle, r: R, ) -> io::Result<IoctlReply>920     fn set_fsxattr<R: io::Read>(
921         &self,
922         inode: Inode,
923         handle: Handle,
924         r: R,
925     ) -> io::Result<IoctlReply> {
926         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
927             self.find_inode(inode)?
928         } else {
929             self.find_handle(handle, inode)?
930         };
931 
932         let attr = fsxattr::from_reader(r)?;
933 
934         //  Safe because this doesn't modify any memory and we check the return value.
935         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &attr) };
936         if res < 0 {
937             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
938         } else {
939             Ok(IoctlReply::Done(Ok(Vec::new())))
940         }
941     }
942 
get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>943     fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
944         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
945             self.find_inode(inode)?
946         } else {
947             self.find_handle(handle, inode)?
948         };
949 
950         // The ioctl encoding is a long but the parameter is actually an int.
951         let mut flags: c_int = 0;
952 
953         // Safe because the kernel will only write to `flags` and we check the return value.
954         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut flags) };
955         if res < 0 {
956             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
957         } else {
958             Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
959         }
960     }
961 
set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply>962     fn set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply> {
963         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
964             self.find_inode(inode)?
965         } else {
966             self.find_handle(handle, inode)?
967         };
968 
969         // The ioctl encoding is a long but the parameter is actually an int.
970         let flags = c_int::from_reader(r)?;
971 
972         // Safe because this doesn't modify any memory and we check the return value.
973         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &flags) };
974         if res < 0 {
975             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
976         } else {
977             Ok(IoctlReply::Done(Ok(Vec::new())))
978         }
979     }
980 }
981 
forget_one( inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, inode: Inode, count: u64, )982 fn forget_one(
983     inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
984     inode: Inode,
985     count: u64,
986 ) {
987     if let Some(data) = inodes.get(&inode) {
988         // Acquiring the write lock on the inode map prevents new lookups from incrementing the
989         // refcount but there is the possibility that a previous lookup already acquired a
990         // reference to the inode data and is in the process of updating the refcount so we need
991         // to loop here until we can decrement successfully.
992         loop {
993             let refcount = data.refcount.load(Ordering::Relaxed);
994 
995             // Saturating sub because it doesn't make sense for a refcount to go below zero and
996             // we don't want misbehaving clients to cause integer overflow.
997             let new_count = refcount.saturating_sub(count);
998 
999             // Synchronizes with the acquire load in `do_lookup`.
1000             if data
1001                 .refcount
1002                 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
1003                 .is_ok()
1004             {
1005                 if new_count == 0 {
1006                     // We just removed the last refcount for this inode. There's no need for an
1007                     // acquire fence here because we hold a write lock on the inode map and any
1008                     // thread that is waiting to do a forget on the same inode will have to wait
1009                     // until we release the lock. So there's is no other release store for us to
1010                     // synchronize with before deleting the entry.
1011                     inodes.remove(&inode);
1012                 }
1013                 break;
1014             }
1015         }
1016     }
1017 }
1018 
1019 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
1020 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
strip_xattr_prefix(buf: &mut Vec<u8>)1021 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
1022     fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
1023         if start >= b.len() {
1024             return None;
1025         }
1026 
1027         let end = b[start..]
1028             .iter()
1029             .position(|&c| c == b'\0')
1030             .map(|p| start + p + 1)
1031             .unwrap_or(b.len());
1032 
1033         Some(&b[start..end])
1034     }
1035 
1036     let mut pos = 0;
1037     while let Some(name) = next_cstr(&buf, pos) {
1038         if !name.starts_with(USER_VIRTIOFS_XATTR) {
1039             pos += name.len();
1040             continue;
1041         }
1042 
1043         let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
1044         buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
1045         pos += newlen;
1046     }
1047 }
1048 
1049 impl FileSystem for PassthroughFs {
1050     type Inode = Inode;
1051     type Handle = Handle;
1052     type DirIter = ReadDir<Box<[u8]>>;
1053 
init(&self, capable: FsOptions) -> io::Result<FsOptions>1054     fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
1055         // Safe because this is a constant value and a valid C string.
1056         let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
1057 
1058         let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1059         // Safe because this doesn't modify any memory and we check the return value.
1060         let raw_descriptor = unsafe { libc::openat(libc::AT_FDCWD, root.as_ptr(), flags) };
1061         if raw_descriptor < 0 {
1062             return Err(io::Error::last_os_error());
1063         }
1064 
1065         // Safe because we just opened this descriptor above.
1066         let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
1067 
1068         let st = stat(&f)?;
1069 
1070         // Safe because this doesn't modify any memory and there is no need to check the return
1071         // value because this system call always succeeds. We need to clear the umask here because
1072         // we want the client to be able to set all the bits in the mode.
1073         unsafe { libc::umask(0o000) };
1074 
1075         let mut inodes = self.inodes.lock();
1076 
1077         // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1078         inodes.insert(
1079             ROOT_ID,
1080             InodeAltKey {
1081                 ino: st.st_ino,
1082                 dev: st.st_dev,
1083             },
1084             Arc::new(InodeData {
1085                 inode: ROOT_ID,
1086                 file: Mutex::new((f, flags)),
1087                 refcount: AtomicU64::new(2),
1088                 filetype: st.st_mode.into(),
1089             }),
1090         );
1091 
1092         let mut opts = FsOptions::DO_READDIRPLUS
1093             | FsOptions::READDIRPLUS_AUTO
1094             | FsOptions::EXPORT_SUPPORT
1095             | FsOptions::DONT_MASK
1096             | FsOptions::POSIX_ACL;
1097         if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
1098             opts |= FsOptions::WRITEBACK_CACHE;
1099             self.writeback.store(true, Ordering::Relaxed);
1100         }
1101         if self.cfg.cache_policy == CachePolicy::Always {
1102             if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
1103                 opts |= FsOptions::ZERO_MESSAGE_OPEN;
1104                 self.zero_message_open.store(true, Ordering::Relaxed);
1105             }
1106             if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
1107                 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
1108                 self.zero_message_opendir.store(true, Ordering::Relaxed);
1109             }
1110         }
1111         Ok(opts)
1112     }
1113 
destroy(&self)1114     fn destroy(&self) {
1115         self.handles.lock().clear();
1116         self.inodes.lock().clear();
1117     }
1118 
statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64>1119     fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
1120         let data = self.find_inode(inode)?;
1121 
1122         let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1123 
1124         // Safe because this will only modify `out` and we check the return value.
1125         let res = unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) };
1126         if res == 0 {
1127             // Safe because the kernel guarantees that `out` has been initialized.
1128             Ok(unsafe { out.assume_init() })
1129         } else {
1130             Err(io::Error::last_os_error())
1131         }
1132     }
1133 
lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry>1134     fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
1135         let data = self.find_inode(parent)?;
1136         self.do_lookup(&data, name).or_else(|e| {
1137             if self.cfg.ascii_casefold {
1138                 self.ascii_casefold_lookup(&data, name.to_bytes())
1139             } else {
1140                 Err(e)
1141             }
1142         })
1143     }
1144 
forget(&self, _ctx: Context, inode: Inode, count: u64)1145     fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
1146         let mut inodes = self.inodes.lock();
1147 
1148         forget_one(&mut inodes, inode, count)
1149     }
1150 
batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>)1151     fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
1152         let mut inodes = self.inodes.lock();
1153 
1154         for (inode, count) in requests {
1155             forget_one(&mut inodes, inode, count)
1156         }
1157     }
1158 
opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1159     fn opendir(
1160         &self,
1161         _ctx: Context,
1162         inode: Inode,
1163         flags: u32,
1164     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1165         if self.zero_message_opendir.load(Ordering::Relaxed) {
1166             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1167         } else {
1168             self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
1169         }
1170     }
1171 
releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()>1172     fn releasedir(
1173         &self,
1174         _ctx: Context,
1175         inode: Inode,
1176         _flags: u32,
1177         handle: Handle,
1178     ) -> io::Result<()> {
1179         if self.zero_message_opendir.load(Ordering::Relaxed) {
1180             Ok(())
1181         } else {
1182             self.do_release(inode, handle)
1183         }
1184     }
1185 
mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, ) -> io::Result<Entry>1186     fn mkdir(
1187         &self,
1188         ctx: Context,
1189         parent: Inode,
1190         name: &CStr,
1191         mode: u32,
1192         umask: u32,
1193     ) -> io::Result<Entry> {
1194         let data = self.find_inode(parent)?;
1195 
1196         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1197         let res = {
1198             let mut um = self.umask.lock();
1199             let _scoped_umask = um.set(umask);
1200 
1201             // Safe because this doesn't modify any memory and we check the return value.
1202             unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) }
1203         };
1204         if res == 0 {
1205             self.do_lookup(&data, name)
1206         } else {
1207             Err(io::Error::last_os_error())
1208         }
1209     }
1210 
rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1211     fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1212         let data = self.find_inode(parent)?;
1213         self.do_unlink(&data, name, libc::AT_REMOVEDIR)
1214     }
1215 
readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result<Self::DirIter>1216     fn readdir(
1217         &self,
1218         _ctx: Context,
1219         inode: Inode,
1220         handle: Handle,
1221         size: u32,
1222         offset: u64,
1223     ) -> io::Result<Self::DirIter> {
1224         let buf = vec![0; size as usize].into_boxed_slice();
1225 
1226         if self.zero_message_opendir.load(Ordering::Relaxed) {
1227             let data = self.find_inode(inode)?;
1228             ReadDir::new(&*data, offset as libc::off64_t, buf)
1229         } else {
1230             let data = self.find_handle(handle, inode)?;
1231 
1232             let dir = data.file.lock();
1233 
1234             ReadDir::new(&*dir, offset as libc::off64_t, buf)
1235         }
1236     }
1237 
open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1238     fn open(
1239         &self,
1240         _ctx: Context,
1241         inode: Inode,
1242         flags: u32,
1243     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1244         if self.zero_message_open.load(Ordering::Relaxed) {
1245             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1246         } else {
1247             self.do_open(inode, flags)
1248         }
1249     }
1250 
release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option<u64>, ) -> io::Result<()>1251     fn release(
1252         &self,
1253         _ctx: Context,
1254         inode: Inode,
1255         _flags: u32,
1256         handle: Handle,
1257         _flush: bool,
1258         _flock_release: bool,
1259         _lock_owner: Option<u64>,
1260     ) -> io::Result<()> {
1261         if self.zero_message_open.load(Ordering::Relaxed) {
1262             Ok(())
1263         } else {
1264             self.do_release(inode, handle)
1265         }
1266     }
1267 
chromeos_tmpfile( &self, ctx: Context, parent: Self::Inode, mode: u32, umask: u32, ) -> io::Result<Entry>1268     fn chromeos_tmpfile(
1269         &self,
1270         ctx: Context,
1271         parent: Self::Inode,
1272         mode: u32,
1273         umask: u32,
1274     ) -> io::Result<Entry> {
1275         let data = self.find_inode(parent)?;
1276 
1277         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1278 
1279         let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
1280 
1281         // Safe because this is a valid c string.
1282         let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
1283 
1284         let fd = {
1285             let mut um = self.umask.lock();
1286             let _scoped_umask = um.set(umask);
1287 
1288             // Safe because this doesn't modify any memory and we check the return value.
1289             unsafe {
1290                 libc::openat(
1291                     data.as_raw_descriptor(),
1292                     current_dir.as_ptr(),
1293                     tmpflags,
1294                     mode,
1295                 )
1296             }
1297         };
1298         if fd < 0 {
1299             return Err(io::Error::last_os_error());
1300         }
1301 
1302         // Safe because we just opened this fd.
1303         let tmpfile = unsafe { File::from_raw_descriptor(fd) };
1304 
1305         let st = stat(&tmpfile)?;
1306         Ok(self.add_entry(tmpfile, st, tmpflags))
1307     }
1308 
create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>1309     fn create(
1310         &self,
1311         ctx: Context,
1312         parent: Inode,
1313         name: &CStr,
1314         mode: u32,
1315         flags: u32,
1316         umask: u32,
1317     ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
1318         let data = self.find_inode(parent)?;
1319 
1320         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1321 
1322         let create_flags =
1323             (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
1324 
1325         let fd = {
1326             let mut um = self.umask.lock();
1327             let _scoped_umask = um.set(umask);
1328 
1329             // Safe because this doesn't modify any memory and we check the return value. We don't
1330             // really check `flags` because if the kernel can't handle poorly specified flags then
1331             // we have much bigger problems.
1332             unsafe { libc::openat(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode) }
1333         };
1334         if fd < 0 {
1335             return Err(io::Error::last_os_error());
1336         }
1337 
1338         // Safe because we just opened this fd.
1339         let file = unsafe { File::from_raw_descriptor(fd) };
1340 
1341         let st = stat(&file)?;
1342         let entry = self.add_entry(file, st, create_flags);
1343 
1344         let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
1345             (None, OpenOptions::KEEP_CACHE)
1346         } else {
1347             self.do_open(
1348                 entry.inode,
1349                 flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
1350             )
1351             .map_err(|e| {
1352                 // Don't leak the entry.
1353                 self.forget(ctx, entry.inode, 1);
1354                 e
1355             })?
1356         };
1357 
1358         Ok((entry, handle, opts))
1359     }
1360 
unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1361     fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1362         let data = self.find_inode(parent)?;
1363         self.do_unlink(&data, name, 0)
1364     }
1365 
read<W: io::Write + ZeroCopyWriter>( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option<u64>, _flags: u32, ) -> io::Result<usize>1366     fn read<W: io::Write + ZeroCopyWriter>(
1367         &self,
1368         _ctx: Context,
1369         inode: Inode,
1370         handle: Handle,
1371         mut w: W,
1372         size: u32,
1373         offset: u64,
1374         _lock_owner: Option<u64>,
1375         _flags: u32,
1376     ) -> io::Result<usize> {
1377         if self.zero_message_open.load(Ordering::Relaxed) {
1378             let data = self.find_inode(inode)?;
1379 
1380             let mut file = data.file.lock();
1381             let mut flags = file.1;
1382             match flags & libc::O_ACCMODE {
1383                 libc::O_WRONLY => {
1384                     flags &= !libc::O_WRONLY;
1385                     flags |= libc::O_RDWR;
1386 
1387                     // We need to get a readable handle for this file.
1388                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1389                     *file = (newfile, flags);
1390                 }
1391                 libc::O_RDONLY | libc::O_RDWR => {}
1392                 _ => panic!("Unexpected flags: {:#x}", flags),
1393             }
1394 
1395             w.write_from(&mut file.0, size as usize, offset)
1396         } else {
1397             let data = self.find_handle(handle, inode)?;
1398 
1399             let mut f = data.file.lock();
1400             w.write_from(&mut f, size as usize, offset)
1401         }
1402     }
1403 
write<R: io::Read + ZeroCopyReader>( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option<u64>, _delayed_write: bool, flags: u32, ) -> io::Result<usize>1404     fn write<R: io::Read + ZeroCopyReader>(
1405         &self,
1406         _ctx: Context,
1407         inode: Inode,
1408         handle: Handle,
1409         mut r: R,
1410         size: u32,
1411         offset: u64,
1412         _lock_owner: Option<u64>,
1413         _delayed_write: bool,
1414         flags: u32,
1415     ) -> io::Result<usize> {
1416         // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
1417         // automatically clear the setuid and setgid bits for us.
1418         let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
1419             Some(drop_cap_fsetid()?)
1420         } else {
1421             None
1422         };
1423 
1424         if self.zero_message_open.load(Ordering::Relaxed) {
1425             let data = self.find_inode(inode)?;
1426 
1427             let mut file = data.file.lock();
1428             let mut flags = file.1;
1429             match flags & libc::O_ACCMODE {
1430                 libc::O_RDONLY => {
1431                     flags &= !libc::O_RDONLY;
1432                     flags |= libc::O_RDWR;
1433 
1434                     // We need to get a writable handle for this file.
1435                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1436                     *file = (newfile, flags);
1437                 }
1438                 libc::O_WRONLY | libc::O_RDWR => {}
1439                 _ => panic!("Unexpected flags: {:#x}", flags),
1440             }
1441 
1442             r.read_to(&mut file.0, size as usize, offset)
1443         } else {
1444             let data = self.find_handle(handle, inode)?;
1445 
1446             let mut f = data.file.lock();
1447             r.read_to(&mut f, size as usize, offset)
1448         }
1449     }
1450 
getattr( &self, _ctx: Context, inode: Inode, _handle: Option<Handle>, ) -> io::Result<(libc::stat64, Duration)>1451     fn getattr(
1452         &self,
1453         _ctx: Context,
1454         inode: Inode,
1455         _handle: Option<Handle>,
1456     ) -> io::Result<(libc::stat64, Duration)> {
1457         let data = self.find_inode(inode)?;
1458         self.do_getattr(&data)
1459     }
1460 
setattr( &self, _ctx: Context, inode: Inode, attr: libc::stat64, handle: Option<Handle>, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)>1461     fn setattr(
1462         &self,
1463         _ctx: Context,
1464         inode: Inode,
1465         attr: libc::stat64,
1466         handle: Option<Handle>,
1467         valid: SetattrValid,
1468     ) -> io::Result<(libc::stat64, Duration)> {
1469         let inode_data = self.find_inode(inode)?;
1470 
1471         enum Data {
1472             Handle(Arc<HandleData>, RawDescriptor),
1473             ProcPath(CString),
1474         }
1475 
1476         // If we have a handle then use it otherwise get a new fd from the inode.
1477         let data = if let Some(handle) = handle.filter(|&h| h != 0) {
1478             let hd = self.find_handle(handle, inode)?;
1479 
1480             let fd = hd.file.lock().as_raw_descriptor();
1481             Data::Handle(hd, fd)
1482         } else {
1483             let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
1484                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1485             Data::ProcPath(pathname)
1486         };
1487 
1488         if valid.contains(SetattrValid::MODE) {
1489             // Safe because this doesn't modify any memory and we check the return value.
1490             let res = unsafe {
1491                 match data {
1492                     Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
1493                     Data::ProcPath(ref p) => {
1494                         libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
1495                     }
1496                 }
1497             };
1498             if res < 0 {
1499                 return Err(io::Error::last_os_error());
1500             }
1501         }
1502 
1503         if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
1504             let uid = if valid.contains(SetattrValid::UID) {
1505                 attr.st_uid
1506             } else {
1507                 // Cannot use -1 here because these are unsigned values.
1508                 ::std::u32::MAX
1509             };
1510             let gid = if valid.contains(SetattrValid::GID) {
1511                 attr.st_gid
1512             } else {
1513                 // Cannot use -1 here because these are unsigned values.
1514                 ::std::u32::MAX
1515             };
1516 
1517             // Safe because this is a constant value and a valid C string.
1518             let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1519 
1520             // Safe because this doesn't modify any memory and we check the return value.
1521             let res = unsafe {
1522                 libc::fchownat(
1523                     inode_data.as_raw_descriptor(),
1524                     empty.as_ptr(),
1525                     uid,
1526                     gid,
1527                     libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
1528                 )
1529             };
1530             if res < 0 {
1531                 return Err(io::Error::last_os_error());
1532             }
1533         }
1534 
1535         if valid.contains(SetattrValid::SIZE) {
1536             // Safe because this doesn't modify any memory and we check the return value.
1537             let res = match data {
1538                 Data::Handle(_, fd) => unsafe { libc::ftruncate64(fd, attr.st_size) },
1539                 _ => {
1540                     // There is no `ftruncateat` so we need to get a new fd and truncate it.
1541                     let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
1542                     unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
1543                 }
1544             };
1545             if res < 0 {
1546                 return Err(io::Error::last_os_error());
1547             }
1548         }
1549 
1550         if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
1551             let mut tvs = [
1552                 libc::timespec {
1553                     tv_sec: 0,
1554                     tv_nsec: libc::UTIME_OMIT,
1555                 },
1556                 libc::timespec {
1557                     tv_sec: 0,
1558                     tv_nsec: libc::UTIME_OMIT,
1559                 },
1560             ];
1561 
1562             if valid.contains(SetattrValid::ATIME_NOW) {
1563                 tvs[0].tv_nsec = libc::UTIME_NOW;
1564             } else if valid.contains(SetattrValid::ATIME) {
1565                 tvs[0].tv_sec = attr.st_atime;
1566                 tvs[0].tv_nsec = attr.st_atime_nsec;
1567             }
1568 
1569             if valid.contains(SetattrValid::MTIME_NOW) {
1570                 tvs[1].tv_nsec = libc::UTIME_NOW;
1571             } else if valid.contains(SetattrValid::MTIME) {
1572                 tvs[1].tv_sec = attr.st_mtime;
1573                 tvs[1].tv_nsec = attr.st_mtime_nsec;
1574             }
1575 
1576             // Safe because this doesn't modify any memory and we check the return value.
1577             let res = match data {
1578                 Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) },
1579                 Data::ProcPath(ref p) => unsafe {
1580                     libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
1581                 },
1582             };
1583             if res < 0 {
1584                 return Err(io::Error::last_os_error());
1585             }
1586         }
1587 
1588         self.do_getattr(&inode_data)
1589     }
1590 
rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()>1591     fn rename(
1592         &self,
1593         _ctx: Context,
1594         olddir: Inode,
1595         oldname: &CStr,
1596         newdir: Inode,
1597         newname: &CStr,
1598         flags: u32,
1599     ) -> io::Result<()> {
1600         let old_inode = self.find_inode(olddir)?;
1601         let new_inode = self.find_inode(newdir)?;
1602 
1603         // Safe because this doesn't modify any memory and we check the return value.
1604         // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
1605         // and we have glibc 2.28.
1606         let res = unsafe {
1607             libc::syscall(
1608                 libc::SYS_renameat2,
1609                 old_inode.as_raw_descriptor(),
1610                 oldname.as_ptr(),
1611                 new_inode.as_raw_descriptor(),
1612                 newname.as_ptr(),
1613                 flags,
1614             )
1615         };
1616         if res == 0 {
1617             Ok(())
1618         } else {
1619             Err(io::Error::last_os_error())
1620         }
1621     }
1622 
mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, ) -> io::Result<Entry>1623     fn mknod(
1624         &self,
1625         ctx: Context,
1626         parent: Inode,
1627         name: &CStr,
1628         mode: u32,
1629         rdev: u32,
1630         umask: u32,
1631     ) -> io::Result<Entry> {
1632         let data = self.find_inode(parent)?;
1633 
1634         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1635 
1636         let res = {
1637             let mut um = self.umask.lock();
1638             let _scoped_umask = um.set(umask);
1639 
1640             // Safe because this doesn't modify any memory and we check the return value.
1641             unsafe {
1642                 libc::mknodat(
1643                     data.as_raw_descriptor(),
1644                     name.as_ptr(),
1645                     mode as libc::mode_t,
1646                     rdev as libc::dev_t,
1647                 )
1648             }
1649         };
1650 
1651         if res < 0 {
1652             Err(io::Error::last_os_error())
1653         } else {
1654             self.do_lookup(&data, name)
1655         }
1656     }
1657 
link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result<Entry>1658     fn link(
1659         &self,
1660         _ctx: Context,
1661         inode: Inode,
1662         newparent: Inode,
1663         newname: &CStr,
1664     ) -> io::Result<Entry> {
1665         let data = self.find_inode(inode)?;
1666         let new_inode = self.find_inode(newparent)?;
1667 
1668         let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1669             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1670 
1671         // Safe because this doesn't modify any memory and we check the return value.
1672         let res = unsafe {
1673             libc::linkat(
1674                 self.proc.as_raw_descriptor(),
1675                 path.as_ptr(),
1676                 new_inode.as_raw_descriptor(),
1677                 newname.as_ptr(),
1678                 libc::AT_SYMLINK_FOLLOW,
1679             )
1680         };
1681         if res == 0 {
1682             self.do_lookup(&new_inode, newname)
1683         } else {
1684             Err(io::Error::last_os_error())
1685         }
1686     }
1687 
symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, ) -> io::Result<Entry>1688     fn symlink(
1689         &self,
1690         ctx: Context,
1691         linkname: &CStr,
1692         parent: Inode,
1693         name: &CStr,
1694     ) -> io::Result<Entry> {
1695         let data = self.find_inode(parent)?;
1696 
1697         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1698 
1699         // Safe because this doesn't modify any memory and we check the return value.
1700         let res =
1701             unsafe { libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr()) };
1702         if res == 0 {
1703             self.do_lookup(&data, name)
1704         } else {
1705             Err(io::Error::last_os_error())
1706         }
1707     }
1708 
readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>>1709     fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
1710         let data = self.find_inode(inode)?;
1711 
1712         let mut buf = vec![0; libc::PATH_MAX as usize];
1713 
1714         // Safe because this is a constant value and a valid C string.
1715         let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1716 
1717         // Safe because this will only modify the contents of `buf` and we check the return value.
1718         let res = unsafe {
1719             libc::readlinkat(
1720                 data.as_raw_descriptor(),
1721                 empty.as_ptr(),
1722                 buf.as_mut_ptr() as *mut libc::c_char,
1723                 buf.len(),
1724             )
1725         };
1726         if res < 0 {
1727             return Err(io::Error::last_os_error());
1728         }
1729 
1730         buf.resize(res as usize, 0);
1731         Ok(buf)
1732     }
1733 
flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()>1734     fn flush(
1735         &self,
1736         _ctx: Context,
1737         inode: Inode,
1738         handle: Handle,
1739         _lock_owner: u64,
1740     ) -> io::Result<()> {
1741         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1742             self.find_inode(inode)?
1743         } else {
1744             self.find_handle(handle, inode)?
1745         };
1746 
1747         // Since this method is called whenever an fd is closed in the client, we can emulate that
1748         // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
1749         // because this doesn't modify any memory and we check the return values.
1750         unsafe {
1751             let newfd = libc::fcntl(data.as_raw_descriptor(), libc::F_DUPFD_CLOEXEC, 0);
1752 
1753             if newfd < 0 {
1754                 return Err(io::Error::last_os_error());
1755             }
1756 
1757             if libc::close(newfd) < 0 {
1758                 Err(io::Error::last_os_error())
1759             } else {
1760                 Ok(())
1761             }
1762         }
1763     }
1764 
fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()>1765     fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
1766         if self.zero_message_open.load(Ordering::Relaxed) {
1767             let data = self.find_inode(inode)?;
1768             self.do_fsync(&*data, datasync)
1769         } else {
1770             let data = self.find_handle(handle, inode)?;
1771 
1772             let file = data.file.lock();
1773             self.do_fsync(&*file, datasync)
1774         }
1775     }
1776 
fsyncdir( &self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()>1777     fn fsyncdir(
1778         &self,
1779         _ctx: Context,
1780         inode: Inode,
1781         datasync: bool,
1782         handle: Handle,
1783     ) -> io::Result<()> {
1784         if self.zero_message_opendir.load(Ordering::Relaxed) {
1785             let data = self.find_inode(inode)?;
1786             self.do_fsync(&*data, datasync)
1787         } else {
1788             let data = self.find_handle(handle, inode)?;
1789 
1790             let file = data.file.lock();
1791             self.do_fsync(&*file, datasync)
1792         }
1793     }
1794 
access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()>1795     fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
1796         let data = self.find_inode(inode)?;
1797 
1798         let st = stat(&*data)?;
1799         let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
1800 
1801         if mode == libc::F_OK {
1802             // The file exists since we were able to call `stat(2)` on it.
1803             return Ok(());
1804         }
1805 
1806         if (mode & libc::R_OK) != 0 {
1807             if ctx.uid != 0
1808                 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
1809                 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
1810                 && st.st_mode & 0o004 == 0
1811             {
1812                 return Err(io::Error::from_raw_os_error(libc::EACCES));
1813             }
1814         }
1815 
1816         if (mode & libc::W_OK) != 0 {
1817             if ctx.uid != 0
1818                 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
1819                 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
1820                 && st.st_mode & 0o002 == 0
1821             {
1822                 return Err(io::Error::from_raw_os_error(libc::EACCES));
1823             }
1824         }
1825 
1826         // root can only execute something if it is executable by one of the owner, the group, or
1827         // everyone.
1828         if (mode & libc::X_OK) != 0 {
1829             if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
1830                 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
1831                 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
1832                 && st.st_mode & 0o001 == 0
1833             {
1834                 return Err(io::Error::from_raw_os_error(libc::EACCES));
1835             }
1836         }
1837 
1838         Ok(())
1839     }
1840 
setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()>1841     fn setxattr(
1842         &self,
1843         _ctx: Context,
1844         inode: Inode,
1845         name: &CStr,
1846         value: &[u8],
1847         flags: u32,
1848     ) -> io::Result<()> {
1849         // We can't allow the VM to set this xattr because an unprivileged process may use it to set
1850         // a privileged xattr.
1851         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
1852             return Err(io::Error::from_raw_os_error(libc::EPERM));
1853         }
1854 
1855         let data = self.find_inode(inode)?;
1856         let name = self.rewrite_xattr_name(name);
1857 
1858         let res = if data.filetype == FileType::Other {
1859             // For non-regular files and directories, we cannot open the fd normally. Instead we
1860             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1861             // and then setting the CWD back to the root directory.
1862             let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1863                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1864 
1865             // Safe because this doesn't modify any memory and we check the return value.
1866             self.with_proc_chdir(|| unsafe {
1867                 libc::setxattr(
1868                     path.as_ptr(),
1869                     name.as_ptr(),
1870                     value.as_ptr() as *const libc::c_void,
1871                     value.len() as libc::size_t,
1872                     flags as c_int,
1873                 )
1874             })
1875         } else {
1876             // For regular files and directories, we can just use fsetxattr. Safe because this
1877             // doesn't modify any memory and we check the return value.
1878             unsafe {
1879                 libc::fsetxattr(
1880                     data.as_raw_descriptor(),
1881                     name.as_ptr(),
1882                     value.as_ptr() as *const libc::c_void,
1883                     value.len() as libc::size_t,
1884                     flags as c_int,
1885                 )
1886             }
1887         };
1888 
1889         if res < 0 {
1890             Err(io::Error::last_os_error())
1891         } else {
1892             Ok(())
1893         }
1894     }
1895 
getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result<GetxattrReply>1896     fn getxattr(
1897         &self,
1898         _ctx: Context,
1899         inode: Inode,
1900         name: &CStr,
1901         size: u32,
1902     ) -> io::Result<GetxattrReply> {
1903         // We don't allow the VM to set this xattr so we also pretend there is no value associated
1904         // with it.
1905         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
1906             return Err(io::Error::from_raw_os_error(libc::ENODATA));
1907         }
1908 
1909         let data = self.find_inode(inode)?;
1910         let name = self.rewrite_xattr_name(name);
1911         let mut buf = vec![0u8; size as usize];
1912 
1913         // Safe because this will only modify the contents of `buf`.
1914         let res = self.do_getxattr(&data, &name, &mut buf[..])?;
1915         if size == 0 {
1916             Ok(GetxattrReply::Count(res as u32))
1917         } else {
1918             buf.truncate(res as usize);
1919             Ok(GetxattrReply::Value(buf))
1920         }
1921     }
1922 
listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply>1923     fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
1924         let data = self.find_inode(inode)?;
1925 
1926         let mut buf = vec![0u8; size as usize];
1927 
1928         let res = if data.filetype == FileType::Other {
1929             // For non-regular files and directories, we cannot open the fd normally. Instead we
1930             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1931             // and then setting the CWD back to the root directory.
1932             let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1933                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1934 
1935             // Safe because this will only modify `buf` and we check the return value.
1936             self.with_proc_chdir(|| unsafe {
1937                 libc::listxattr(
1938                     path.as_ptr(),
1939                     buf.as_mut_ptr() as *mut libc::c_char,
1940                     buf.len() as libc::size_t,
1941                 )
1942             })
1943         } else {
1944             // For regular files and directories, we can just flistxattr. Safe because this will only
1945             // write to `buf` and we check the return value.
1946             unsafe {
1947                 libc::flistxattr(
1948                     data.as_raw_descriptor(),
1949                     buf.as_mut_ptr() as *mut libc::c_char,
1950                     buf.len() as libc::size_t,
1951                 )
1952             }
1953         };
1954 
1955         if res < 0 {
1956             return Err(io::Error::last_os_error());
1957         }
1958 
1959         if size == 0 {
1960             Ok(ListxattrReply::Count(res as u32))
1961         } else {
1962             buf.truncate(res as usize);
1963 
1964             if self.cfg.rewrite_security_xattrs {
1965                 strip_xattr_prefix(&mut buf);
1966             }
1967             Ok(ListxattrReply::Names(buf))
1968         }
1969     }
1970 
removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()>1971     fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
1972         // We don't allow the VM to set this xattr so we also pretend there is no value associated
1973         // with it.
1974         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
1975             return Err(io::Error::from_raw_os_error(libc::ENODATA));
1976         }
1977 
1978         let data = self.find_inode(inode)?;
1979         let name = self.rewrite_xattr_name(name);
1980 
1981         let res = if data.filetype == FileType::Other {
1982             // For non-regular files and directories, we cannot open the fd normally. Instead we
1983             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1984             // and then setting the CWD back to the root directory.
1985             let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1986                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1987 
1988             // Safe because this doesn't modify any memory and we check the return value.
1989             self.with_proc_chdir(|| unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) })
1990         } else {
1991             // For regular files and directories, we can just use fremovexattr. Safe because this
1992             // doesn't modify any memory and we check the return value.
1993             unsafe { libc::fremovexattr(data.as_raw_descriptor(), name.as_ptr()) }
1994         };
1995 
1996         if res == 0 {
1997             Ok(())
1998         } else {
1999             Err(io::Error::last_os_error())
2000         }
2001     }
2002 
fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()>2003     fn fallocate(
2004         &self,
2005         _ctx: Context,
2006         inode: Inode,
2007         handle: Handle,
2008         mode: u32,
2009         offset: u64,
2010         length: u64,
2011     ) -> io::Result<()> {
2012         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2013             let data = self.find_inode(inode)?;
2014 
2015             {
2016                 // fallocate needs a writable fd
2017                 let mut file = data.file.lock();
2018                 let mut flags = file.1;
2019                 match flags & libc::O_ACCMODE {
2020                     libc::O_RDONLY => {
2021                         flags &= !libc::O_RDONLY;
2022                         flags |= libc::O_RDWR;
2023 
2024                         // We need to get a writable handle for this file.
2025                         let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2026                         *file = (newfile, flags);
2027                     }
2028                     libc::O_WRONLY | libc::O_RDWR => {}
2029                     _ => panic!("Unexpected flags: {:#x}", flags),
2030                 }
2031             }
2032 
2033             data
2034         } else {
2035             self.find_handle(handle, inode)?
2036         };
2037 
2038         let fd = data.as_raw_descriptor();
2039         // Safe because this doesn't modify any memory and we check the return value.
2040         let res = unsafe {
2041             libc::fallocate64(
2042                 fd,
2043                 mode as libc::c_int,
2044                 offset as libc::off64_t,
2045                 length as libc::off64_t,
2046             )
2047         };
2048         if res == 0 {
2049             Ok(())
2050         } else {
2051             Err(io::Error::last_os_error())
2052         }
2053     }
2054 
ioctl<R: io::Read>( &self, _ctx: Context, inode: Inode, handle: Handle, _flags: IoctlFlags, cmd: u32, _arg: u64, in_size: u32, out_size: u32, r: R, ) -> io::Result<IoctlReply>2055     fn ioctl<R: io::Read>(
2056         &self,
2057         _ctx: Context,
2058         inode: Inode,
2059         handle: Handle,
2060         _flags: IoctlFlags,
2061         cmd: u32,
2062         _arg: u64,
2063         in_size: u32,
2064         out_size: u32,
2065         r: R,
2066     ) -> io::Result<IoctlReply> {
2067         const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
2068         const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
2069         const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
2070         const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
2071         const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
2072         const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
2073         const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
2074 
2075         match cmd {
2076             GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
2077             GET_FSXATTR => {
2078                 if out_size < size_of::<fsxattr>() as u32 {
2079                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2080                 } else {
2081                     self.get_fsxattr(inode, handle)
2082                 }
2083             }
2084             SET_FSXATTR => {
2085                 if in_size < size_of::<fsxattr>() as u32 {
2086                     Err(io::Error::from_raw_os_error(libc::EINVAL))
2087                 } else {
2088                     self.set_fsxattr(inode, handle, r)
2089                 }
2090             }
2091             GET_FLAGS32 | GET_FLAGS64 => {
2092                 if out_size < size_of::<c_int>() as u32 {
2093                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2094                 } else {
2095                     self.get_flags(inode, handle)
2096                 }
2097             }
2098             SET_FLAGS32 | SET_FLAGS64 => {
2099                 if in_size < size_of::<c_int>() as u32 {
2100                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2101                 } else {
2102                     self.set_flags(inode, handle, r)
2103                 }
2104             }
2105             _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
2106         }
2107     }
2108 
copy_file_range( &self, ctx: Context, inode_src: Inode, handle_src: Handle, offset_src: u64, inode_dst: Inode, handle_dst: Handle, offset_dst: u64, length: u64, flags: u64, ) -> io::Result<usize>2109     fn copy_file_range(
2110         &self,
2111         ctx: Context,
2112         inode_src: Inode,
2113         handle_src: Handle,
2114         offset_src: u64,
2115         inode_dst: Inode,
2116         handle_dst: Handle,
2117         offset_dst: u64,
2118         length: u64,
2119         flags: u64,
2120     ) -> io::Result<usize> {
2121         // We need to change credentials during a write so that the kernel will remove setuid or
2122         // setgid bits from the file if it was written to by someone other than the owner.
2123         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2124         let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
2125             if self.zero_message_open.load(Ordering::Relaxed) {
2126                 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
2127             } else {
2128                 (
2129                     self.find_handle(handle_src, inode_src)?,
2130                     self.find_handle(handle_dst, inode_dst)?,
2131                 )
2132             };
2133 
2134         let src = src_data.as_raw_descriptor();
2135         let dst = dst_data.as_raw_descriptor();
2136 
2137         let res = unsafe {
2138             libc::syscall(
2139                 libc::SYS_copy_file_range,
2140                 src,
2141                 &offset_src,
2142                 dst,
2143                 &offset_dst,
2144                 length,
2145                 flags,
2146             )
2147         };
2148 
2149         if res >= 0 {
2150             Ok(res as usize)
2151         } else {
2152             Err(io::Error::last_os_error())
2153         }
2154     }
2155 
set_up_mapping<M: Mapper>( &self, _ctx: Context, inode: Self::Inode, _handle: Self::Handle, file_offset: u64, mem_offset: u64, size: usize, prot: u32, mapper: M, ) -> io::Result<()>2156     fn set_up_mapping<M: Mapper>(
2157         &self,
2158         _ctx: Context,
2159         inode: Self::Inode,
2160         _handle: Self::Handle,
2161         file_offset: u64,
2162         mem_offset: u64,
2163         size: usize,
2164         prot: u32,
2165         mapper: M,
2166     ) -> io::Result<()> {
2167         let read = prot & libc::PROT_READ as u32 != 0;
2168         let write = prot & libc::PROT_WRITE as u32 != 0;
2169         let mmap_flags = match (read, write) {
2170             (true, true) => libc::O_RDWR,
2171             (true, false) => libc::O_RDONLY,
2172             (false, true) => libc::O_RDWR, // mmap always requires an fd opened for reading.
2173             (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
2174         };
2175 
2176         let data = self.find_inode(inode)?;
2177 
2178         if self.zero_message_open.load(Ordering::Relaxed) {
2179             let mut file = data.file.lock();
2180             let mut open_flags = file.1;
2181             match (mmap_flags, open_flags & libc::O_ACCMODE) {
2182                 (libc::O_RDONLY, libc::O_WRONLY)
2183                 | (libc::O_RDWR, libc::O_RDONLY)
2184                 | (libc::O_RDWR, libc::O_WRONLY) => {
2185                     // We have a read-only or write-only fd and we need to upgrade it.
2186                     open_flags &= !libc::O_ACCMODE;
2187                     open_flags |= libc::O_RDWR;
2188 
2189                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2190                     *file = (newfile, open_flags);
2191                 }
2192                 (libc::O_RDONLY, libc::O_RDONLY)
2193                 | (libc::O_RDONLY, libc::O_RDWR)
2194                 | (libc::O_RDWR, libc::O_RDWR) => {}
2195                 (m, o) => panic!(
2196                     "Unexpected combination of access flags: ({:#x}, {:#x})",
2197                     m, o
2198                 ),
2199             }
2200             mapper.map(mem_offset, size, &file.0, file_offset, prot)
2201         } else {
2202             let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
2203             mapper.map(mem_offset, size, &file, file_offset, prot)
2204         }
2205     }
2206 
remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()>2207     fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
2208         for RemoveMappingOne { moffset, len } in msgs {
2209             mapper.unmap(*moffset, *len)?;
2210         }
2211         Ok(())
2212     }
2213 }
2214 
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a `&CStr` from a NUL-terminated byte literal using the checked constructor, so a
    // malformed literal fails the test instead of relying on `unsafe`.
    fn cstr(bytes: &[u8]) -> &CStr {
        CStr::from_bytes_with_nul(bytes).expect("test literal must be a valid C string")
    }

    // Checks which names `rewrite_xattr_name` leaves alone and which it prefixes when security
    // xattr rewriting is enabled.
    #[test]
    fn rewrite_xattr_names() {
        let cfg = Config {
            rewrite_security_xattrs: true,
            ..Default::default()
        };

        let p = PassthroughFs::new(cfg).expect("Failed to create PassthroughFs");

        // Selinux shouldn't get overwritten; user, trusted, and system should not be changed
        // either.
        let unchanged: [&[u8]; 4] = [
            b"security.selinux\0",
            b"user.foobar\0",
            b"trusted.foobar\0",
            b"system.foobar\0",
        ];
        for raw in unchanged.iter() {
            let name = cstr(raw);
            assert_eq!(p.rewrite_xattr_name(name).to_bytes(), name.to_bytes());
        }

        // sehash should be re-written.
        let sehash = cstr(b"security.sehash\0");
        assert_eq!(
            p.rewrite_xattr_name(sehash).to_bytes(),
            b"user.virtiofs.security.sehash"
        );
    }

    // Runs `strip_xattr_prefix` over a table of (input, expected output) name lists.
    #[test]
    fn strip_xattr_names() {
        let cases: [(&[u8], &[u8]); 7] = [
            // Only NUL bytes.
            (b"\0\0\0\0\0", b"\0\0\0\0\0"),
            // No NUL terminators, and the buffer does not start with the prefix.
            (b"security.sehashuser.virtiofs", b"security.sehashuser.virtiofs"),
            // Empty list.
            (b"", b""),
            // Nothing strippable.
            (
                b"security.selinux\0user.foobar\0system.test\0",
                b"security.selinux\0user.foobar\0system.test\0",
            ),
            // Only strippable names.
            (
                b"user.virtiofs.security.sehash\0user.virtiofs.security.wtf\0",
                b"security.sehash\0security.wtf\0",
            ),
            // Strippable and non-strippable names mixed.
            (
                b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wtf\0user.foobar\0",
                b"security.sehash\0security.selinux\0security.wtf\0user.foobar\0",
            ),
            // A prefixed name without a trailing NUL.
            (b"user.virtiofs.security.sehash", b"security.sehash"),
        ];

        for (input, expected) in cases.iter() {
            let mut actual = input.to_vec();
            strip_xattr_prefix(&mut actual);
            assert_eq!(
                &actual[..],
                *expected,
                "unexpected result for input {:?}",
                input
            );
        }
    }
}
2287