• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::borrow::Cow;
6 use std::cell::RefCell;
7 use std::cmp;
8 use std::collections::btree_map;
9 use std::collections::BTreeMap;
10 use std::ffi::CStr;
11 use std::ffi::CString;
12 use std::fs::File;
13 use std::io;
14 use std::mem;
15 use std::mem::size_of;
16 use std::mem::MaybeUninit;
17 use std::os::raw::c_int;
18 use std::os::raw::c_long;
19 use std::ptr;
20 use std::ptr::addr_of;
21 use std::ptr::addr_of_mut;
22 use std::sync::atomic::AtomicBool;
23 use std::sync::atomic::AtomicU64;
24 use std::sync::atomic::Ordering;
25 use std::sync::Arc;
26 use std::sync::MutexGuard;
27 use std::time::Duration;
28 
29 use base::error;
30 use base::ioctl_ior_nr;
31 use base::ioctl_iow_nr;
32 use base::ioctl_iowr_nr;
33 use base::ioctl_with_mut_ptr;
34 use base::ioctl_with_ptr;
35 use base::syscall;
36 use base::unix::FileFlags;
37 use base::warn;
38 use base::AsRawDescriptor;
39 use base::FromRawDescriptor;
40 use base::Protection;
41 use base::RawDescriptor;
42 use fuse::filesystem::Context;
43 use fuse::filesystem::DirectoryIterator;
44 use fuse::filesystem::Entry;
45 use fuse::filesystem::FileSystem;
46 use fuse::filesystem::FsOptions;
47 use fuse::filesystem::GetxattrReply;
48 use fuse::filesystem::IoctlFlags;
49 use fuse::filesystem::IoctlReply;
50 use fuse::filesystem::ListxattrReply;
51 use fuse::filesystem::OpenOptions;
52 use fuse::filesystem::RemoveMappingOne;
53 use fuse::filesystem::SetattrValid;
54 use fuse::filesystem::ZeroCopyReader;
55 use fuse::filesystem::ZeroCopyWriter;
56 use fuse::filesystem::ROOT_ID;
57 use fuse::sys::WRITE_KILL_PRIV;
58 use fuse::Mapper;
59 #[cfg(feature = "arc_quota")]
60 use protobuf::Message;
61 use sync::Mutex;
62 #[cfg(feature = "arc_quota")]
63 use system_api::client::OrgChromiumSpaced;
64 #[cfg(feature = "arc_quota")]
65 use system_api::spaced::SetProjectIdReply;
66 #[cfg(feature = "arc_quota")]
67 use system_api::spaced::SetProjectInheritanceFlagReply;
68 use zerocopy::AsBytes;
69 use zerocopy::FromBytes;
70 use zerocopy::FromZeroes;
71 
72 use crate::virtio::fs::caps::Capability;
73 use crate::virtio::fs::caps::Caps;
74 use crate::virtio::fs::caps::Set as CapSet;
75 use crate::virtio::fs::caps::Value as CapValue;
76 use crate::virtio::fs::config::CachePolicy;
77 use crate::virtio::fs::config::Config;
78 use crate::virtio::fs::expiring_map::ExpiringMap;
79 use crate::virtio::fs::multikey::MultikeyBTreeMap;
80 use crate::virtio::fs::read_dir::ReadDir;
81 
// Nul-terminated byte strings. The ones used in this part of the file are wrapped in a `CStr`
// and handed to libc calls as path names.
const EMPTY_CSTR: &[u8] = b"\0";
const ROOT_CSTR: &[u8] = b"/\0";
const PROC_CSTR: &[u8] = b"/proc\0";
const UNLABELED_CSTR: &[u8] = b"unlabeled\0";

// Extended-attribute name prefixes (no nul terminator). When xattr rewriting is enabled,
// `rewrite_xattr_name` prepends `USER_VIRTIOFS_XATTR` to names that start with
// `SECURITY_XATTR`, leaving the exact name `SELINUX_XATTR` untouched.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";

// Byte lengths of the key fields in `fscrypt_policy_v1` and `fscrypt_policy_v2`.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// NOTE(review): presumably the inode flag bit for project-ID inheritance used with the
// FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls — confirm against the kernel headers.
#[cfg(feature = "arc_quota")]
const FS_PROJINHERIT_FL: c_int = 0x20000000;

// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "arc_quota")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
100 
/// Internal shorthand for emitting a `cros_tracing::trace_event!()` from a VirtioFS call.
///
/// Takes the tag, the event name and at least one further argument. `VirtioFs` is passed as
/// the first argument of the underlying macro and the name is forwarded ahead of the tag.
macro_rules! fs_trace {
    ($tag:expr, $name:expr, $($extra:expr),+) => {
        cros_tracing::trace_event!(VirtioFs, $name, $tag, $($extra),+)
    };
}
107 
/// Version 1 encryption policy, 12 bytes with C layout.
///
/// NOTE(review): presumably mirrors the Linux UAPI `struct fscrypt_policy_v1` — confirm
/// against the kernel headers. Every field is underscore-prefixed; the visible code only
/// uses this type as a member of the `fscrypt_policy` union.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
117 
/// Version 2 encryption policy, 24 bytes with C layout.
///
/// NOTE(review): presumably mirrors the Linux UAPI `struct fscrypt_policy_v2` — confirm
/// against the kernel headers.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    // Explicit padding between the flags and the key identifier.
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
128 
/// Storage large enough for either policy version.
///
/// All three members start at offset 0, so `_version` overlays the first byte of both
/// `_v1` and `_v2`.
#[repr(C)]
#[derive(Copy, Clone, FromZeroes, FromBytes)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
136 
137 #[repr(C)]
138 #[derive(Copy, Clone, FromZeroes, FromBytes)]
139 struct fscrypt_get_policy_ex_arg {
140     policy_size: u64,       /* input/output */
141     policy: fscrypt_policy, /* output */
142 }
143 
144 impl From<&fscrypt_get_policy_ex_arg> for &[u8] {
from(value: &fscrypt_get_policy_ex_arg) -> Self145     fn from(value: &fscrypt_get_policy_ex_arg) -> Self {
146         assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
147         let data_raw: *const fscrypt_get_policy_ex_arg = value;
148         // SAFETY: the length of the output slice is asserted to be within the struct it points to
149         unsafe {
150             std::slice::from_raw_parts(
151                 data_raw.cast(),
152                 value.policy_size as usize + size_of::<u64>(),
153             )
154         }
155     }
156 }
157 
// The payload type is declared as `[u8; 9]`, not `fscrypt_get_policy_ex_arg`.
// NOTE(review): presumably this matches how the kernel header encodes the size in the request
// number — confirm before changing it.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
159 
/// Argument block of the `FS_IOC_FSGETXATTR` / `FS_IOC_FSSETXATTR` ioctls declared below.
///
/// 28 bytes with C layout: five `u32` fields followed by 8 bytes of padding.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set) */
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
    fsx_pad: [u8; 8],
}
170 
// Read / write an `fsxattr` block.
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// The three GETFLAGS/SETFLAGS pairs below share ioctl type 'f' and numbers 1 and 2; they differ
// only in the declared argument type (`c_long`, `u32`, `u64`).
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
182 
/// Argument block of the `FS_IOC_ENABLE_VERITY` ioctl declared below.
///
/// NOTE(review): `salt_ptr` and `sig_ptr` presumably carry user-space addresses of buffers
/// whose lengths are `salt_size` and `sig_size` — confirm against the fs-verity UAPI.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    sig_ptr: u64,
    __reserved2: [u64; 11],
}
196 
/// Fixed-size header of the `FS_IOC_MEASURE_VERITY` argument.
///
/// The C definition ends in a flexible `digest[]` array, which is not represented here.
/// NOTE(review): `digest_size` presumably gives the length of that trailing array — confirm.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}
204 
// fs-verity ioctls: enabling takes a `fsverity_enable_arg`, measuring takes a `fsverity_digest`
// header.
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
207 
/// Identifier of an entry in `PassthroughFs::inodes`.
pub type Inode = u64;
/// Identifier of an entry in `PassthroughFs::handles`.
type Handle = u64;
210 
/// Alternate key type for the `PassthroughFs::inodes` multi-key map: an inode number paired
/// with a device number.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
216 
217 #[derive(PartialEq, Eq, Debug)]
218 enum FileType {
219     Regular,
220     Directory,
221     Other,
222 }
223 
224 impl From<libc::mode_t> for FileType {
from(mode: libc::mode_t) -> Self225     fn from(mode: libc::mode_t) -> Self {
226         match mode & libc::S_IFMT {
227             libc::S_IFREG => FileType::Regular,
228             libc::S_IFDIR => FileType::Directory,
229             _ => FileType::Other,
230         }
231     }
232 }
233 
234 #[derive(Debug)]
235 struct InodeData {
236     inode: Inode,
237     // (File, open_flags)
238     file: Mutex<(File, libc::c_int)>,
239     refcount: AtomicU64,
240     filetype: FileType,
241     path: String,
242 }
243 
244 impl AsRawDescriptor for InodeData {
as_raw_descriptor(&self) -> RawDescriptor245     fn as_raw_descriptor(&self) -> RawDescriptor {
246         self.file.lock().0.as_raw_descriptor()
247     }
248 }
249 
250 #[derive(Debug)]
251 struct HandleData {
252     inode: Inode,
253     file: Mutex<File>,
254 }
255 
256 impl AsRawDescriptor for HandleData {
as_raw_descriptor(&self) -> RawDescriptor257     fn as_raw_descriptor(&self) -> RawDescriptor {
258         self.file.lock().as_raw_descriptor()
259     }
260 }
261 
/// Generates a guard type `$name` that switches one credential of type `$ty` for the current
/// thread by invoking the raw syscall `$syscall_nr`, and switches it back on drop.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            // Value to switch back to when the guard is dropped.
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Returns
            // `Ok(None)` when `val` already equals `old`; otherwise returns a guard that
            // changes the thread's credentials back to `old` when it is dropped.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                if val == old {
                    // Already running with the requested value.
                    return Ok(None);
                }

                // Credential changes must stay per-thread, otherwise they would interfere
                // with operations running on other threads under different uids/gids.
                // POSIX, however, requires all threads of a process to share credentials,
                // and libc enforces that by signalling the other threads to apply the same
                // change.
                //
                // Invoking the syscall directly sidesteps that. setfsuid/setfsgid would be
                // an alternative, but they cannot report errors, so this is preferable.

                // SAFETY: this call is safe because it doesn't modify any memory and we
                // check the return value.
                match unsafe { libc::syscall($syscall_nr, -1, val, -1) } {
                    0 => Ok(Some($name { old })),
                    _ => Err(io::Error::last_os_error()),
                }
            }
        }

        impl Drop for $name {
            fn drop(&mut self) {
                // SAFETY: trivially safe
                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if res >= 0 {
                    return;
                }
                error!(
                    "failed to change credentials back to {}: {}",
                    self.old,
                    io::Error::last_os_error(),
                );
            }
        }
    };
}
// Guard types for the thread's effective uid and gid. On 32-bit ARM the `*32` syscall numbers
// are selected instead of the plain ones.
// NOTE(review): presumably because the plain calls use 16-bit ids on that architecture —
// confirm.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);

#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);

// Syscall numbers used to read the thread's effective uid/gid, with the same per-architecture
// selection as above.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
#[cfg(target_arch = "arm")]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;

#[cfg(not(target_arch = "arm"))]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
#[cfg(target_arch = "arm")]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
335 
// Effective uid/gid of each thread, read with a raw syscall the first time the thread accesses
// the value and cached afterwards. `set_creds` uses them as the "old" credentials to restore.
thread_local! {
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
344 
set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)>345 fn set_creds(
346     uid: libc::uid_t,
347     gid: libc::gid_t,
348 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
349     let olduid = THREAD_EUID.with(|uid| *uid);
350     let oldgid = THREAD_EGID.with(|gid| *gid);
351 
352     // We have to change the gid before we change the uid because if we change the uid first then we
353     // lose the capability to change the gid.  However changing back can happen in any order.
354     ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
355 }
356 
// Per-thread handle to /proc/thread-self/attr/fscreate. It starts out as `None` and is opened
// by `ScopedSecurityContext::new` the first time the thread needs it.
thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = RefCell::new(None));
358 
359 // Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
360 // open the file.
open_fscreate(proc: &File) -> File361 fn open_fscreate(proc: &File) -> File {
362     // SAFETY: This string is nul-terminated and does not contain any interior nul bytes
363     let fscreate = unsafe { CStr::from_bytes_with_nul_unchecked(b"thread-self/attr/fscreate\0") };
364 
365     // SAFETY: this doesn't modify any memory and we check the return value.
366     let raw_descriptor = unsafe {
367         libc::openat(
368             proc.as_raw_descriptor(),
369             fscreate.as_ptr(),
370             libc::O_CLOEXEC | libc::O_WRONLY,
371         )
372     };
373 
374     // We don't expect this to fail and we're not in a position to return an error here so just
375     // panic.
376     if raw_descriptor < 0 {
377         panic!(
378             "Failed to open /proc/thread-self/attr/fscreate: {}",
379             io::Error::last_os_error()
380         );
381     }
382 
383     // SAFETY: safe because we just opened this descriptor.
384     unsafe { File::from_raw_descriptor(raw_descriptor) }
385 }
386 
387 struct ScopedSecurityContext;
388 
389 impl ScopedSecurityContext {
new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext>390     fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
391         THREAD_FSCREATE.with(|thread_fscreate| {
392             let mut fscreate = thread_fscreate.borrow_mut();
393             let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
394             // SAFETY: this doesn't modify any memory and we check the return value.
395             let ret = unsafe {
396                 libc::write(
397                     file.as_raw_descriptor(),
398                     ctx.as_ptr() as *const libc::c_void,
399                     ctx.to_bytes_with_nul().len(),
400                 )
401             };
402             if ret < 0 {
403                 Err(io::Error::last_os_error())
404             } else {
405                 Ok(ScopedSecurityContext)
406             }
407         })
408     }
409 }
410 
411 impl Drop for ScopedSecurityContext {
drop(&mut self)412     fn drop(&mut self) {
413         THREAD_FSCREATE.with(|thread_fscreate| {
414             // expect is safe here because the thread local would have been initialized by the call
415             // to `new` above.
416             let fscreate = thread_fscreate.borrow();
417             let file = fscreate
418                 .as_ref()
419                 .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
420 
421             // SAFETY: this doesn't modify any memory and we check the return value.
422             let ret = unsafe { libc::write(file.as_raw_descriptor(), ptr::null(), 0) };
423 
424             if ret < 0 {
425                 warn!(
426                     "Failed to restore security context: {}",
427                     io::Error::last_os_error()
428                 );
429             }
430         })
431     }
432 }
433 
434 struct ScopedUmask {
435     old: libc::mode_t,
436     mask: libc::mode_t,
437 }
438 
439 impl ScopedUmask {
new(mask: libc::mode_t) -> ScopedUmask440     fn new(mask: libc::mode_t) -> ScopedUmask {
441         ScopedUmask {
442             // SAFETY: this doesn't modify any memory and always succeeds.
443             old: unsafe { libc::umask(mask) },
444             mask,
445         }
446     }
447 }
448 
449 impl Drop for ScopedUmask {
drop(&mut self)450     fn drop(&mut self) {
451         // SAFETY: this doesn't modify any memory and always succeeds.
452         let previous = unsafe { libc::umask(self.old) };
453         debug_assert_eq!(
454             previous, self.mask,
455             "umask changed while holding ScopedUmask"
456         );
457     }
458 }
459 
460 struct ScopedFsetid(Caps);
461 impl Drop for ScopedFsetid {
drop(&mut self)462     fn drop(&mut self) {
463         if let Err(e) = raise_cap_fsetid(&mut self.0) {
464             error!(
465                 "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
466                 e
467             )
468         }
469     }
470 }
471 
raise_cap_fsetid(c: &mut Caps) -> io::Result<()>472 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
473     c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
474     c.apply()
475 }
476 
477 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
478 // adds the capability back when it is dropped.
drop_cap_fsetid() -> io::Result<ScopedFsetid>479 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
480     let mut caps = Caps::for_current_thread()?;
481     caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
482     caps.apply()?;
483     Ok(ScopedFsetid(caps))
484 }
485 
/// Builds an `io::Error` carrying `EBADF`; used when an inode or handle lookup fails.
fn ebadf() -> io::Error {
    io::Error::from_raw_os_error(libc::EBADF)
}
489 
/// Builds an `io::Error` carrying `EEXIST`.
fn eexist() -> io::Error {
    io::Error::from_raw_os_error(libc::EEXIST)
}
493 
stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64>494 fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
495     let mut st = MaybeUninit::<libc::stat64>::zeroed();
496 
497     // SAFETY: this is a constant value that is a nul-terminated string without interior nul bytes.
498     let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
499 
500     // SAFETY: the kernel will only write data in `st` and we check the return value.
501     syscall!(unsafe {
502         libc::fstatat64(
503             f.as_raw_descriptor(),
504             pathname.as_ptr(),
505             st.as_mut_ptr(),
506             libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
507         )
508     })?;
509 
510     // SAFETY: the kernel guarantees that the struct is now fully initialized.
511     Ok(unsafe { st.assume_init() })
512 }
513 
statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64>514 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
515     let mut st = MaybeUninit::<libc::stat64>::zeroed();
516 
517     // SAFETY: the kernel will only write data in `st` and we check the return value.
518     syscall!(unsafe {
519         libc::fstatat64(
520             dir.as_raw_descriptor(),
521             name.as_ptr(),
522             st.as_mut_ptr(),
523             libc::AT_SYMLINK_NOFOLLOW,
524         )
525     })?;
526 
527     // SAFETY: the kernel guarantees that the struct is now fully initialized.
528     Ok(unsafe { st.assume_init() })
529 }
530 
531 #[cfg(feature = "arc_quota")]
/// Returns whether `project_id` lies in one of the project ID ranges used by Android.
///
/// The bounds are taken from android_filesystem_config.h in the Android codebase.
fn is_android_project_id(project_id: u32) -> bool {
    // Project IDs reserved for Android files on external storage: 100 IDs starting at
    // PROJECT_ID_EXT_DEFAULT (1000).
    const EXT_FILES_FIRST: u32 = 1000;
    const EXT_FILES_LAST: u32 = 1099;
    // Project IDs reserved for Android apps. The lower bound is PROJECT_ID_EXT_DATA_START.
    // The upper bound differs before and after T; the one from T
    // (PROJECT_ID_APP_CACHE_END) is used because it is larger.
    const APPS_FIRST: u32 = 20000;
    const APPS_LAST: u32 = 69999;

    matches!(
        project_id,
        EXT_FILES_FIRST..=EXT_FILES_LAST | APPS_FIRST..=APPS_LAST
    )
}
549 
550 /// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
551 ///
552 /// The key of the underlying `BTreeMap` is a lower-cased file name in the direcoty.
553 /// The value is the case-sensitive file name stored in the host file system.
554 /// We assume that if PassthroughFs has exclusive access to the filesystem, this cache exhaustively
555 ///  covers all file names that exist within the directory.
556 /// So every `PassthroughFs`'s handler that adds or removes files in the directory is expected to
557 /// update this cache.
558 struct CasefoldCache(BTreeMap<Vec<u8>, CString>);
559 
560 impl CasefoldCache {
new(dir: &InodeData) -> io::Result<Self>561     fn new(dir: &InodeData) -> io::Result<Self> {
562         let mut mp = BTreeMap::new();
563 
564         let mut buf = [0u8; 1024];
565         let mut offset = 0;
566         loop {
567             let mut read_dir = ReadDir::new(dir, offset, &mut buf[..])?;
568             if read_dir.remaining() == 0 {
569                 break;
570             }
571 
572             while let Some(entry) = read_dir.next() {
573                 offset = entry.offset as libc::off64_t;
574                 let entry_name = entry.name;
575                 mp.insert(
576                     entry_name.to_bytes().to_ascii_lowercase(),
577                     entry_name.to_owned(),
578                 );
579             }
580         }
581         Ok(Self(mp))
582     }
583 
insert(&mut self, name: &CStr)584     fn insert(&mut self, name: &CStr) {
585         let lower_case = name.to_bytes().to_ascii_lowercase();
586         self.0.insert(lower_case, name.into());
587     }
588 
lookup(&self, name: &[u8]) -> Option<CString>589     fn lookup(&self, name: &[u8]) -> Option<CString> {
590         let lower = name.to_ascii_lowercase();
591         self.0.get(&lower).cloned()
592     }
593 
remove(&mut self, name: &CStr)594     fn remove(&mut self, name: &CStr) {
595         let lower_case = name.to_bytes().to_ascii_lowercase();
596         self.0.remove(&lower_case);
597     }
598 }
599 
600 /// Time expiring mapping from an inode of a directory to `CasefoldCache` for the directory.
601 /// Each entry will be expired after `timeout`.
602 /// When ascii_casefold is disabled, this struct does nothing.
603 struct ExpiringCasefoldLookupCaches {
604     inner: ExpiringMap<Inode, CasefoldCache>,
605 }
606 
607 impl ExpiringCasefoldLookupCaches {
new(timeout: Duration) -> Self608     fn new(timeout: Duration) -> Self {
609         Self {
610             inner: ExpiringMap::new(timeout),
611         }
612     }
613 
insert(&mut self, parent: Inode, name: &CStr)614     fn insert(&mut self, parent: Inode, name: &CStr) {
615         if let Some(dir_cache) = self.inner.get_mut(&parent) {
616             dir_cache.insert(name);
617         }
618     }
619 
remove(&mut self, parent: Inode, name: &CStr)620     fn remove(&mut self, parent: Inode, name: &CStr) {
621         if let Some(dir_cache) = self.inner.get_mut(&parent) {
622             dir_cache.remove(name);
623         }
624     }
625 
forget(&mut self, parent: Inode)626     fn forget(&mut self, parent: Inode) {
627         self.inner.remove(&parent);
628     }
629 
630     /// Get `CasefoldCache` for the given directory.
631     /// If the cache doesn't exist, generate it by fetching directory information with
632     /// `getdents64()`.
get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache>633     fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
634         self.inner
635             .get_or_insert_with(&parent.inode, || CasefoldCache::new(parent))
636     }
637 
638     #[cfg(test)]
exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool639     fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
640         if let Some(dir_cache) = self.inner.get(&parent) {
641             dir_cache.lookup(name.to_bytes()).is_some()
642         } else {
643             false
644         }
645     }
646 }
647 
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs {
    // Mutex that must be acquired before executing a process-wide operation such as fchdir.
    process_lock: Mutex<()>,
    // virtio-fs tag that the guest uses when mounting. This is only used for debugging
    // when tracing is enabled.
    #[cfg_attr(not(feature = "trace_marker"), allow(dead_code))]
    tag: String,

    // File descriptors for various points in the file system tree.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    // Counter for inode numbers; `new` starts it at `ROOT_ID + 1`.
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    // Counter for handle numbers; `new` starts it at 1.
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus. Both fields are `None` when
    // `cfg.privileged_quota_uids` is empty.
    #[cfg(feature = "arc_quota")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    // Watch fd of the D-Bus channel; reported by `keep_rds`.
    #[cfg(feature = "arc_quota")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    // Time-expiring cache for `ascii_casefold_lookup()`.
    // The key is an inode of a directory, and the value is a cache for the directory.
    // Each value will be expired `cfg.timeout` after it's created.
    // `None` when `cfg.ascii_casefold` is false.
    //
    // TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
    // if we use PassthroughFs in multi-threaded environments.
    expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,

    cfg: Config,
}
702 
703 impl std::fmt::Debug for PassthroughFs {
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result704     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
705         f.debug_struct("PassthroughFs")
706             .field("tag", &self.tag)
707             .field("next_inode", &self.next_inode)
708             .field("next_handle", &self.next_handle)
709             .field("proc", &self.proc)
710             .field("writeback", &self.writeback)
711             .field("zero_message_open", &self.zero_message_open)
712             .field("zero_message_opendir", &self.zero_message_opendir)
713             .field("cfg", &self.cfg)
714             .finish()
715     }
716 }
717 
718 impl PassthroughFs {
new(tag: &str, cfg: Config) -> io::Result<PassthroughFs>719     pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
720         // SAFETY: this is a constant value that is a nul-terminated string without interior
721         // nul bytes.
722         let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
723 
724         // SAFETY: this doesn't modify any memory and we check the return value.
725         let raw_descriptor = syscall!(unsafe {
726             libc::openat64(
727                 libc::AT_FDCWD,
728                 proc_cstr.as_ptr(),
729                 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
730             )
731         })?;
732 
733         // Privileged UIDs can use D-Bus to perform some operations.
734         #[cfg(feature = "arc_quota")]
735         let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
736             (None, None)
737         } else {
738             let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
739                 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
740             channel.set_watch_enabled(true);
741             let dbus_fd = channel.watch().fd;
742             channel.set_watch_enabled(false);
743             (
744                 Some(Mutex::new(dbus::blocking::Connection::from(channel))),
745                 Some(dbus_fd),
746             )
747         };
748 
749         // SAFETY: safe because we just opened this descriptor.
750         let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
751 
752         let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
753             Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
754         } else {
755             None
756         };
757 
758         let passthroughfs = PassthroughFs {
759             process_lock: Mutex::new(()),
760             tag: tag.to_string(),
761             inodes: Mutex::new(MultikeyBTreeMap::new()),
762             next_inode: AtomicU64::new(ROOT_ID + 1),
763 
764             handles: Mutex::new(BTreeMap::new()),
765             next_handle: AtomicU64::new(1),
766 
767             proc,
768 
769             writeback: AtomicBool::new(false),
770             zero_message_open: AtomicBool::new(false),
771             zero_message_opendir: AtomicBool::new(false),
772 
773             #[cfg(feature = "arc_quota")]
774             dbus_connection,
775             #[cfg(feature = "arc_quota")]
776             dbus_fd,
777             expiring_casefold_lookup_caches,
778             cfg,
779         };
780 
781         cros_tracing::trace_simple_print!(
782             VirtioFs,
783             "New PassthroughFS initialized: {:?}",
784             passthroughfs
785         );
786         Ok(passthroughfs)
787     }
788 
/// Returns the configuration this file system was created with.
pub fn cfg(&self) -> &Config {
    &self.cfg
}
792 
keep_rds(&self) -> Vec<RawDescriptor>793     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
794         #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
795         let mut keep_rds = vec![self.proc.as_raw_descriptor()];
796         #[cfg(feature = "arc_quota")]
797         if let Some(fd) = self.dbus_fd {
798             keep_rds.push(fd);
799         }
800         keep_rds
801     }
802 
rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr>803     fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
804         if !self.cfg.rewrite_security_xattrs {
805             return Cow::Borrowed(name);
806         }
807 
808         // Does not include nul-terminator.
809         let buf = name.to_bytes();
810         if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
811             return Cow::Borrowed(name);
812         }
813 
814         let mut newname = USER_VIRTIOFS_XATTR.to_vec();
815         newname.extend_from_slice(buf);
816 
817         // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
818         // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
819         Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
820     }
821 
find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>>822     fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
823         self.inodes
824             .lock()
825             .get(&inode)
826             .map(Arc::clone)
827             .ok_or_else(ebadf)
828     }
829 
find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>>830     fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
831         self.handles
832             .lock()
833             .get(&handle)
834             .filter(|hd| hd.inode == inode)
835             .map(Arc::clone)
836             .ok_or_else(ebadf)
837     }
838 
open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File>839     fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
840         let pathname = CString::new(format!("self/fd/{}", fd))
841             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
842 
843         // SAFETY: this doesn't modify any memory and we check the return value. We don't really
844         // check `flags` because if the kernel can't handle poorly specified flags then we have
845         // much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
846         // to follow the `/proc/self/fd` symlink to get the file.
847         let raw_descriptor = syscall!(unsafe {
848             libc::openat64(
849                 self.proc.as_raw_descriptor(),
850                 pathname.as_ptr(),
851                 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
852             )
853         })?;
854 
855         // SAFETY: safe because we just opened this descriptor.
856         Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
857     }
858 
859     /// Modifies the provided open flags based on the writeback caching configuration.
860     /// Return the updated open flags.
update_open_flags(&self, mut flags: i32) -> i32861     fn update_open_flags(&self, mut flags: i32) -> i32 {
862         // When writeback caching is enabled, the kernel may send read requests even if the
863         // userspace program opened the file write-only. So we need to ensure that we have opened
864         // the file for reading as well as writing.
865         let writeback = self.writeback.load(Ordering::Relaxed);
866         if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
867             flags &= !libc::O_ACCMODE;
868             flags |= libc::O_RDWR;
869         }
870 
871         // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
872         // However, this breaks atomicity as the file may have changed on disk, invalidating the
873         // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
874         // the file. Just allow this for now as it is the user's responsibility to enable writeback
875         // caching only for directories that are not shared. It also means that we need to clear the
876         // `O_APPEND` flag.
877         if writeback && flags & libc::O_APPEND != 0 {
878             flags &= !libc::O_APPEND;
879         }
880 
881         flags
882     }
883 
open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File>884     fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
885         // handle writeback caching cases
886         flags = self.update_open_flags(flags);
887 
888         self.open_fd(inode.as_raw_descriptor(), flags)
889     }
890 
    // Increases the inode refcount by one and returns the inode number stored in `inode_data`.
    fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
        // Matches with the release store in `forget`.
        inode_data.refcount.fetch_add(1, Ordering::Acquire);
        inode_data.inode
    }
897 
898     // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
899     // The inodes mutex lock must not be already taken by the same thread otherwise this
900     // will deadlock.
add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int, path: String) -> Entry901     fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int, path: String) -> Entry {
902         let mut inodes = self.inodes.lock();
903 
904         let altkey = InodeAltKey {
905             ino: st.st_ino,
906             dev: st.st_dev,
907         };
908 
909         let inode = if let Some(data) = inodes.get_alt(&altkey) {
910             self.increase_inode_refcount(data)
911         } else {
912             let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
913             inodes.insert(
914                 inode,
915                 altkey,
916                 Arc::new(InodeData {
917                     inode,
918                     file: Mutex::new((f, open_flags)),
919                     refcount: AtomicU64::new(1),
920                     filetype: st.st_mode.into(),
921                     path,
922                 }),
923             );
924 
925             inode
926         };
927 
928         Entry {
929             inode,
930             generation: 0,
931             attr: st,
932             // We use the same timeout for the attribute and the entry.
933             attr_timeout: self.cfg.timeout,
934             entry_timeout: self.cfg.timeout,
935         }
936     }
937 
938     /// Acquires lock of `expiring_casefold_lookup_caches` if `ascii_casefold` is enabled.
lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>>939     fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
940         self.expiring_casefold_lookup_caches
941             .as_ref()
942             .map(|c| c.lock())
943     }
944 
945     // Returns an actual case-sensitive file name that matches with the given `name`.
946     // Returns `Ok(None)` if no file matches with the give `name`.
947     // This function will panic if casefold is not enabled.
get_case_unfolded_name( &self, parent: &InodeData, name: &[u8], ) -> io::Result<Option<CString>>948     fn get_case_unfolded_name(
949         &self,
950         parent: &InodeData,
951         name: &[u8],
952     ) -> io::Result<Option<CString>> {
953         let mut caches = self
954             .lock_casefold_lookup_caches()
955             .expect("casefold must be enabled");
956         let dir_cache = caches.get(parent)?;
957         Ok(dir_cache.lookup(name))
958     }
959 
960     // Performs an ascii case insensitive lookup.
ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry>961     fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
962         match self.get_case_unfolded_name(parent, name)? {
963             None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
964             Some(actual_name) => self.do_lookup(parent, &actual_name),
965         }
966     }
967 
968     #[cfg(test)]
exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool969     fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
970         let mut cache = self
971             .lock_casefold_lookup_caches()
972             .expect("casefold must be enabled");
973         cache.exists_in_cache(parent, name)
974     }
975 
do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry>976     fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
977         let st = statat(parent, name)?;
978 
979         let altkey = InodeAltKey {
980             ino: st.st_ino,
981             dev: st.st_dev,
982         };
983 
984         // Check if we already have an entry before opening a new file.
985         if let Some(data) = self.inodes.lock().get_alt(&altkey) {
986             // Return the same inode with the reference counter increased.
987             return Ok(Entry {
988                 inode: self.increase_inode_refcount(data),
989                 generation: 0,
990                 attr: st,
991                 // We use the same timeout for the attribute and the entry.
992                 attr_timeout: self.cfg.timeout,
993                 entry_timeout: self.cfg.timeout,
994             });
995         }
996 
997         // Open a regular file with O_RDONLY to store in `InodeData` so explicit open requests can
998         // be skipped later if the ZERO_MESSAGE_{OPEN,OPENDIR} features are enabled.
999         // If the crosvm process doesn't have a read permission, fall back to O_PATH below.
1000         let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1001         match FileType::from(st.st_mode) {
1002             FileType::Regular => {}
1003             FileType::Directory => flags |= libc::O_DIRECTORY,
1004             FileType::Other => flags |= libc::O_PATH,
1005         };
1006 
1007         // SAFETY: this doesn't modify any memory and we check the return value.
1008         let fd = match unsafe {
1009             syscall!(libc::openat64(
1010                 parent.as_raw_descriptor(),
1011                 name.as_ptr(),
1012                 flags
1013             ))
1014         } {
1015             Ok(fd) => fd,
1016             Err(e) if e.errno() == libc::EACCES => {
1017                 // If O_RDONLY is unavailable, fall back to O_PATH to get an FD to store in
1018                 // `InodeData`.
1019                 // Note that some operations which should be allowed without read permissions
1020                 // require syscalls that don't support O_PATH fds. For those syscalls, we will
1021                 // need to fall back to their path-based equivalents with /self/fd/${FD}.
1022                 // e.g. `fgetxattr()` for an O_PATH FD fails while `getxaattr()` for /self/fd/${FD}
1023                 // works.
1024                 flags |= libc::O_PATH;
1025                 // SAFETY: this doesn't modify any memory and we check the return value.
1026                 unsafe {
1027                     syscall!(libc::openat64(
1028                         parent.as_raw_descriptor(),
1029                         name.as_ptr(),
1030                         flags
1031                     ))
1032                 }?
1033             }
1034             Err(e) => {
1035                 return Err(e.into());
1036             }
1037         };
1038 
1039         // SAFETY: safe because we own the fd.
1040         let f = unsafe { File::from_raw_descriptor(fd) };
1041         let path = format!(
1042             "{}/{}",
1043             parent.path.clone(),
1044             name.to_str().unwrap_or("<non UTF-8 str>")
1045         );
1046         // We made sure the lock acquired for `self.inodes` is released automatically when
1047         // the if block above is exited, so a call to `self.add_entry()` should not cause a deadlock
1048         // here. This would not be the case if this were executed in an else block instead.
1049         Ok(self.add_entry(f, st, flags, path))
1050     }
1051 
get_cache_open_options(&self, flags: u32) -> OpenOptions1052     fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
1053         let mut opts = OpenOptions::empty();
1054         match self.cfg.cache_policy {
1055             // We only set the direct I/O option on files.
1056             CachePolicy::Never => opts.set(
1057                 OpenOptions::DIRECT_IO,
1058                 flags & (libc::O_DIRECTORY as u32) == 0,
1059             ),
1060             CachePolicy::Always => {
1061                 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
1062                     OpenOptions::KEEP_CACHE
1063                 } else {
1064                     OpenOptions::CACHE_DIR
1065                 }
1066             }
1067             _ => {}
1068         };
1069         opts
1070     }
1071 
1072     // Performs lookup using original name first, if it fails and ascii_casefold is enabled,
1073     // it tries to unfold the name and do lookup again.
do_lookup_with_casefold_fallback( &self, parent: &InodeData, name: &CStr, ) -> io::Result<Entry>1074     fn do_lookup_with_casefold_fallback(
1075         &self,
1076         parent: &InodeData,
1077         name: &CStr,
1078     ) -> io::Result<Entry> {
1079         let mut res = self.do_lookup(parent, name);
1080         // If `ascii_casefold` is enabled, fallback to `ascii_casefold_lookup()`.
1081         if res.is_err() && self.cfg.ascii_casefold {
1082             res = self.ascii_casefold_lookup(parent, name.to_bytes());
1083         }
1084         res
1085     }
1086 
do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)>1087     fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
1088         let inode_data = self.find_inode(inode)?;
1089 
1090         let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
1091 
1092         let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1093         let data = HandleData { inode, file };
1094 
1095         self.handles.lock().insert(handle, Arc::new(data));
1096 
1097         let opts = self.get_cache_open_options(flags);
1098 
1099         Ok((Some(handle), opts))
1100     }
1101 
do_open_at( &self, parent_data: Arc<InodeData>, name: &CStr, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1102     fn do_open_at(
1103         &self,
1104         parent_data: Arc<InodeData>,
1105         name: &CStr,
1106         inode: Inode,
1107         flags: u32,
1108     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1109         let open_flags = self.update_open_flags(flags as i32);
1110 
1111         let fd_open = syscall!(
1112             // SAFETY: return value is checked.
1113             unsafe {
1114                 libc::openat64(
1115                     parent_data.as_raw_descriptor(),
1116                     name.as_ptr(),
1117                     (open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1118                 )
1119             }
1120         )?;
1121 
1122         // SAFETY: fd_open is valid
1123         let file_open = unsafe { File::from_raw_descriptor(fd_open) };
1124         let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1125         let data = HandleData {
1126             inode,
1127             file: Mutex::new(file_open),
1128         };
1129 
1130         self.handles.lock().insert(handle, Arc::new(data));
1131 
1132         let opts = self.get_cache_open_options(open_flags as u32);
1133         Ok((Some(handle), opts))
1134     }
1135 
do_release(&self, inode: Inode, handle: Handle) -> io::Result<()>1136     fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1137         let mut handles = self.handles.lock();
1138 
1139         if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1140             if e.get().inode == inode {
1141                 // We don't need to close the file here because that will happen automatically when
1142                 // the last `Arc` is dropped.
1143                 e.remove();
1144                 return Ok(());
1145             }
1146         }
1147 
1148         Err(ebadf())
1149     }
1150 
do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)>1151     fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1152         let st = stat(inode)?;
1153 
1154         Ok((st, self.cfg.timeout))
1155     }
1156 
do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()>1157     fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1158         // SAFETY: this doesn't modify any memory and we check the return value.
1159         syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1160         Ok(())
1161     }
1162 
do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()>1163     fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1164         // SAFETY: this doesn't modify any memory and we check the return value.
1165         syscall!(unsafe {
1166             if datasync {
1167                 libc::fdatasync(file.as_raw_descriptor())
1168             } else {
1169                 libc::fsync(file.as_raw_descriptor())
1170             }
1171         })?;
1172 
1173         Ok(())
1174     }
1175 
    // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
    // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
    // there is no *at syscall available. Returns whatever `f` returns.
    //
    // Panics if there is no root inode. A failing `fchdir` is only caught by
    // `debug_assert_eq!`, i.e. it panics when debug assertions are enabled and is otherwise
    // ignored.
    //
    // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
    // be taken to avoid the risk of deadlocks.
    fn with_proc_chdir<F, T>(&self, f: F) -> T
    where
        F: FnOnce() -> T,
    {
        // Looked up before taking `process_lock`; this briefly locks the inode table.
        let root = self.find_inode(ROOT_ID).expect("failed to find root inode");

        // Acquire a lock for `fchdir`. The guard is held until this function returns, so the
        // change of CWD, the call to `f` and the change back happen under the same lock.
        let _proc_lock = self.process_lock.lock();
        // SAFETY: this doesn't modify any memory and we check the return value. Since the
        // fchdir should never fail we just use debug_asserts.
        let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
        debug_assert_eq!(
            proc_cwd,
            0,
            "failed to fchdir to /proc: {}",
            io::Error::last_os_error()
        );

        let res = f();

        // SAFETY: this doesn't modify any memory and we check the return value. Since the
        // fchdir should never fail we just use debug_asserts.
        let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
        debug_assert_eq!(
            root_cwd,
            0,
            "failed to fchdir back to root directory: {}",
            io::Error::last_os_error()
        );

        res
    }
1215 
do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize>1216     fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
1217         let file = inode.file.lock();
1218         let o_path_file = (file.1 & libc::O_PATH) != 0;
1219         let res = if o_path_file {
1220             // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
1221             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1222             //  and then setting the CWD back to the root directory.
1223             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
1224                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1225 
1226             // SAFETY: this will only modify `value` and we check the return value.
1227             self.with_proc_chdir(|| unsafe {
1228                 libc::getxattr(
1229                     path.as_ptr(),
1230                     name.as_ptr(),
1231                     value.as_mut_ptr() as *mut libc::c_void,
1232                     value.len() as libc::size_t,
1233                 )
1234             })
1235         } else {
1236             // For regular files and directories, we can just use fgetxattr.
1237             // SAFETY: this will only write to `value` and we check the return value.
1238             unsafe {
1239                 libc::fgetxattr(
1240                     file.0.as_raw_descriptor(),
1241                     name.as_ptr(),
1242                     value.as_mut_ptr() as *mut libc::c_void,
1243                     value.len() as libc::size_t,
1244                 )
1245             }
1246         };
1247 
1248         if res < 0 {
1249             Err(io::Error::last_os_error())
1250         } else {
1251             Ok(res as usize)
1252         }
1253     }
1254 
get_encryption_policy_ex<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1255     fn get_encryption_policy_ex<R: io::Read>(
1256         &self,
1257         inode: Inode,
1258         handle: Handle,
1259         mut r: R,
1260     ) -> io::Result<IoctlReply> {
1261         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1262             self.find_inode(inode)?
1263         } else {
1264             self.find_handle(handle, inode)?
1265         };
1266 
1267         // SAFETY: this struct only has integer fields and any value is valid.
1268         let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
1269         r.read_exact(arg.policy_size.as_bytes_mut())?;
1270 
1271         let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
1272         arg.policy_size = policy_size;
1273 
1274         let res =
1275             // SAFETY: the kernel will only write to `arg` and we check the return value.
1276             unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
1277         if res < 0 {
1278             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1279         } else {
1280             let len = size_of::<u64>() + arg.policy_size as usize;
1281             Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
1282         }
1283     }
1284 
get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1285     fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1286         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1287             self.find_inode(inode)?
1288         } else {
1289             self.find_handle(handle, inode)?
1290         };
1291 
1292         let mut buf = MaybeUninit::<fsxattr>::zeroed();
1293 
1294         // SAFETY: the kernel will only write to `buf` and we check the return value.
1295         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
1296         if res < 0 {
1297             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1298         } else {
1299             // SAFETY: the kernel guarantees that the policy is now initialized.
1300             let xattr = unsafe { buf.assume_init() };
1301             Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1302         }
1303     }
1304 
    /// Handles the `FS_IOC_FSSETXATTR` ioctl.
    ///
    /// An `fsxattr` is read from `r` and applied to the inode (when zero-message open is active)
    /// or to the handle.
    ///
    /// With the `arc_quota` feature, if the caller's uid owns the file or is listed in
    /// `privileged_quota_uids`, and the requested project ID differs from the current one, the
    /// project ID change is first requested from Spaced over D-Bus. The project ID must satisfy
    /// `is_android_project_id`, otherwise `EINVAL` is returned.
    ///
    /// Failures of the ioctls and an unsuccessful Spaced reply are reported inside
    /// `IoctlReply::Done`; lookup, read, reply-parsing and D-Bus call failures are returned as
    /// `Err`.
    fn set_fsxattr<R: io::Read>(
        &self,
        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
        inode: Inode,
        handle: Handle,
        mut r: R,
    ) -> io::Result<IoctlReply> {
        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
            self.find_inode(inode)?
        } else {
            self.find_handle(handle, inode)?
        };

        // The attributes to apply, as sent by the requester.
        let mut in_attr = fsxattr::new_zeroed();
        r.read_exact(in_attr.as_bytes_mut())?;

        #[cfg(feature = "arc_quota")]
        let st = stat(&*data)?;

        // Changing quota project ID requires CAP_FOWNER or being file owner.
        // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
        #[cfg(feature = "arc_quota")]
        if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
            // Get the current fsxattr.
            let mut buf = MaybeUninit::<fsxattr>::zeroed();
            // SAFETY: the kernel will only write to `buf` and we check the return value.
            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
            if res < 0 {
                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
            }
            // SAFETY: `buf` was zero-initialized above and the ioctl reported success.
            let current_attr = unsafe { buf.assume_init() };

            // Project ID cannot be changed inside a user namespace.
            // Use Spaced to avoid this restriction.
            if current_attr.fsx_projid != in_attr.fsx_projid {
                // NOTE(review): this `unwrap()` panics if `dbus_connection` is `None`; confirm
                // that it is always `Some` whenever this branch is reachable.
                let connection = self.dbus_connection.as_ref().unwrap().lock();
                let proxy = connection.with_proxy(
                    "org.chromium.Spaced",
                    "/org/chromium/Spaced",
                    DEFAULT_DBUS_TIMEOUT,
                );
                let project_id = in_attr.fsx_projid;
                if !is_android_project_id(project_id) {
                    return Err(io::Error::from_raw_os_error(libc::EINVAL));
                }
                // Spaced receives its own descriptor for the file.
                let file_clone = base::SafeDescriptor::try_from(&*data)?;
                match proxy.set_project_id(file_clone.into(), project_id) {
                    Ok(r) => {
                        let r = SetProjectIdReply::parse_from_bytes(&r)
                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
                        if !r.success {
                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                                r.error,
                            ))));
                        }
                    }
                    Err(e) => {
                        return Err(io::Error::new(io::ErrorKind::Other, e));
                    }
                };
            }
        }

        // SAFETY: this doesn't modify any memory and we check the return value.
        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &in_attr) };
        if res < 0 {
            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
        } else {
            Ok(IoctlReply::Done(Ok(Vec::new())))
        }
    }
1377 
get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1378     fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1379         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1380             self.find_inode(inode)?
1381         } else {
1382             self.find_handle(handle, inode)?
1383         };
1384 
1385         // The ioctl encoding is a long but the parameter is actually an int.
1386         let mut flags: c_int = 0;
1387 
1388         // SAFETY: the kernel will only write to `flags` and we check the return value.
1389         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut flags) };
1390         if res < 0 {
1391             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1392         } else {
1393             Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1394         }
1395     }
1396 
    /// Handles the `FS_IOC_SETFLAGS` ioctl.
    ///
    /// The new flags are read from `r` and applied to the inode (when zero-message open is
    /// active) or to the handle.
    ///
    /// With the `arc_quota` feature, if the caller's uid owns the file or is listed in
    /// `privileged_quota_uids`, and the `FS_PROJINHERIT_FL` bit differs between the requested
    /// and the current flags, the change of that bit is first requested from Spaced over D-Bus.
    ///
    /// Failures of the ioctls and an unsuccessful Spaced reply are reported inside
    /// `IoctlReply::Done`; lookup, read, reply-parsing and D-Bus call failures are returned as
    /// `Err`.
    fn set_flags<R: io::Read>(
        &self,
        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
        inode: Inode,
        handle: Handle,
        mut r: R,
    ) -> io::Result<IoctlReply> {
        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
            self.find_inode(inode)?
        } else {
            self.find_handle(handle, inode)?
        };

        // The ioctl encoding is a long but the parameter is actually an int.
        let mut in_flags: c_int = 0;
        r.read_exact(in_flags.as_bytes_mut())?;

        #[cfg(feature = "arc_quota")]
        let st = stat(&*data)?;

        // Only the file owner or a privileged uid can have the project inheritance flag changed
        // through Spaced.
        #[cfg(feature = "arc_quota")]
        if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
            // Get the current flag.
            let mut buf = MaybeUninit::<c_int>::zeroed();
            // SAFETY: the kernel will only write to `buf` and we check the return value.
            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), buf.as_mut_ptr()) };
            if res < 0 {
                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
            }
            // SAFETY: `buf` was zero-initialized above and the ioctl reported success.
            let current_flags = unsafe { buf.assume_init() };

            // Project inheritance flag cannot be changed inside a user namespace.
            // Use Spaced to avoid this restriction.
            if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
                // NOTE(review): this `unwrap()` panics if `dbus_connection` is `None`; confirm
                // that it is always `Some` whenever this branch is reachable.
                let connection = self.dbus_connection.as_ref().unwrap().lock();
                let proxy = connection.with_proxy(
                    "org.chromium.Spaced",
                    "/org/chromium/Spaced",
                    DEFAULT_DBUS_TIMEOUT,
                );
                // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
                // reset.
                let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
                // Spaced receives its own descriptor for the file.
                let file_clone = base::SafeDescriptor::try_from(&*data)?;
                match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
                    Ok(r) => {
                        let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
                        if !r.success {
                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                                r.error,
                            ))));
                        }
                    }
                    Err(e) => {
                        return Err(io::Error::new(io::ErrorKind::Other, e));
                    }
                };
            }
        }

        // SAFETY: this doesn't modify any memory and we check the return value.
        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &in_flags) };
        if res < 0 {
            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
        } else {
            Ok(IoctlReply::Done(Ok(Vec::new())))
        }
    }
1468 
enable_verity<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1469     fn enable_verity<R: io::Read>(
1470         &self,
1471         inode: Inode,
1472         handle: Handle,
1473         mut r: R,
1474     ) -> io::Result<IoctlReply> {
1475         let inode_data = self.find_inode(inode)?;
1476 
1477         // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1478         match inode_data.filetype {
1479             FileType::Regular => {}
1480             FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1481             FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1482         }
1483 
1484         {
1485             // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1486             let mut file = inode_data.file.lock();
1487             let mut flags = file.1;
1488             match flags & libc::O_ACCMODE {
1489                 libc::O_WRONLY | libc::O_RDWR => {
1490                     flags &= !libc::O_ACCMODE;
1491                     flags |= libc::O_RDONLY;
1492 
1493                     // We need to get a read-only handle for this file.
1494                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDONLY)?;
1495                     *file = (newfile, flags);
1496                 }
1497                 libc::O_RDONLY => {}
1498                 _ => panic!("Unexpected flags: {:#x}", flags),
1499             }
1500         }
1501 
1502         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1503             inode_data
1504         } else {
1505             let data = self.find_handle(handle, inode)?;
1506 
1507             {
1508                 // We can't enable verity while holding a writable fd. We don't know whether the
1509                 // file was opened for writing so check it here. We don't expect
1510                 // this to be a frequent operation so the extra latency should be
1511                 // fine.
1512                 let mut file = data.file.lock();
1513                 let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1514                 match flags {
1515                     FileFlags::ReadWrite | FileFlags::Write => {
1516                         // We need to get a read-only handle for this file.
1517                         *file = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1518                     }
1519                     FileFlags::Read => {}
1520                 }
1521             }
1522 
1523             data
1524         };
1525 
1526         let mut arg = fsverity_enable_arg::new_zeroed();
1527         r.read_exact(arg.as_bytes_mut())?;
1528 
1529         let mut salt;
1530         if arg.salt_size > 0 {
1531             if arg.salt_size > self.max_buffer_size() {
1532                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1533                     libc::ENOMEM,
1534                 ))));
1535             }
1536             salt = vec![0; arg.salt_size as usize];
1537             r.read_exact(&mut salt)?;
1538             arg.salt_ptr = salt.as_ptr() as usize as u64;
1539         } else {
1540             arg.salt_ptr = 0;
1541         }
1542 
1543         let mut sig;
1544         if arg.sig_size > 0 {
1545             if arg.sig_size > self.max_buffer_size() {
1546                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1547                     libc::ENOMEM,
1548                 ))));
1549             }
1550             sig = vec![0; arg.sig_size as usize];
1551             r.read_exact(&mut sig)?;
1552             arg.sig_ptr = sig.as_ptr() as usize as u64;
1553         } else {
1554             arg.sig_ptr = 0;
1555         }
1556 
1557         // SAFETY: this doesn't modify any memory and we check the return value.
1558         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY(), &arg) };
1559         if res < 0 {
1560             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1561         } else {
1562             Ok(IoctlReply::Done(Ok(Vec::new())))
1563         }
1564     }
1565 
measure_verity<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, out_size: u32, ) -> io::Result<IoctlReply>1566     fn measure_verity<R: io::Read>(
1567         &self,
1568         inode: Inode,
1569         handle: Handle,
1570         mut r: R,
1571         out_size: u32,
1572     ) -> io::Result<IoctlReply> {
1573         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1574             self.find_inode(inode)?
1575         } else {
1576             self.find_handle(handle, inode)?
1577         };
1578 
1579         let mut digest = fsverity_digest::new_zeroed();
1580         r.read_exact(digest.as_bytes_mut())?;
1581 
1582         // Taken from fs/verity/fsverity_private.h.
1583         const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
1584 
1585         // This digest size is what the fsverity command line utility uses.
1586         const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
1587         const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
1588         const ROUNDED_LEN: usize =
1589             (BUFLEN + size_of::<fsverity_digest>() - 1) / size_of::<fsverity_digest>();
1590 
1591         // Make sure we get a properly aligned allocation.
1592         let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
1593 
1594         // SAFETY: we are only writing data and not reading uninitialized memory.
1595         unsafe {
1596             // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
1597             addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
1598                 .write(DIGEST_SIZE)
1599         };
1600 
1601         // SAFETY: this will only modify `buf` and we check the return value.
1602         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY(), buf.as_mut_ptr()) };
1603         if res < 0 {
1604             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1605         } else {
1606             let digest_size =
1607                 // SAFETY: this value was initialized by us already and then overwritten by the kernel.
1608                 // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
1609                 unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
1610             let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
1611 
1612             // The kernel guarantees this but it doesn't hurt to be paranoid.
1613             debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
1614             if digest.digest_size < digest_size || out_size < outlen {
1615                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1616                     libc::EOVERFLOW,
1617                 ))));
1618             }
1619 
1620             let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
1621                 // SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
1622                 // doesn't contain any references.
1623                 unsafe { mem::transmute(buf) };
1624 
1625             let buf =
1626                 // SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
1627                 // first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
1628                 // to have the same layout as `u8`.
1629                 // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
1630                 unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
1631             Ok(IoctlReply::Done(Ok(buf.to_vec())))
1632         }
1633     }
1634 }
1635 
1636 /// Decrements the refcount of the inode.
1637 /// Returns `true` if the refcount became 0.
forget_one( inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, inode: Inode, count: u64, ) -> bool1638 fn forget_one(
1639     inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
1640     inode: Inode,
1641     count: u64,
1642 ) -> bool {
1643     if let Some(data) = inodes.get(&inode) {
1644         // Acquiring the write lock on the inode map prevents new lookups from incrementing the
1645         // refcount but there is the possibility that a previous lookup already acquired a
1646         // reference to the inode data and is in the process of updating the refcount so we need
1647         // to loop here until we can decrement successfully.
1648         loop {
1649             let refcount = data.refcount.load(Ordering::Relaxed);
1650 
1651             // Saturating sub because it doesn't make sense for a refcount to go below zero and
1652             // we don't want misbehaving clients to cause integer overflow.
1653             let new_count = refcount.saturating_sub(count);
1654 
1655             // Synchronizes with the acquire load in `do_lookup`.
1656             if data
1657                 .refcount
1658                 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
1659                 .is_ok()
1660             {
1661                 if new_count == 0 {
1662                     // We just removed the last refcount for this inode. There's no need for an
1663                     // acquire fence here because we hold a write lock on the inode map and any
1664                     // thread that is waiting to do a forget on the same inode will have to wait
1665                     // until we release the lock. So there's is no other release store for us to
1666                     // synchronize with before deleting the entry.
1667                     inodes.remove(&inode);
1668                     return true;
1669                 }
1670                 break;
1671             }
1672         }
1673     }
1674     false
1675 }
1676 
1677 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
1678 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
strip_xattr_prefix(buf: &mut Vec<u8>)1679 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
1680     fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
1681         if start >= b.len() {
1682             return None;
1683         }
1684 
1685         let end = b[start..]
1686             .iter()
1687             .position(|&c| c == b'\0')
1688             .map(|p| start + p + 1)
1689             .unwrap_or(b.len());
1690 
1691         Some(&b[start..end])
1692     }
1693 
1694     let mut pos = 0;
1695     while let Some(name) = next_cstr(buf, pos) {
1696         if !name.starts_with(USER_VIRTIOFS_XATTR) {
1697             pos += name.len();
1698             continue;
1699         }
1700 
1701         let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
1702         buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
1703         pos += newlen;
1704     }
1705 }
1706 
1707 impl FileSystem for PassthroughFs {
1708     type Inode = Inode;
1709     type Handle = Handle;
1710     type DirIter = ReadDir<Box<[u8]>>;
1711 
init(&self, capable: FsOptions) -> io::Result<FsOptions>1712     fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
1713         // SAFETY: this is a constant value that is a nul-terminated string without interior
1714         // nul bytes.
1715         let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
1716 
1717         let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1718         // SAFETY: this doesn't modify any memory and we check the return value.
1719         let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
1720         if raw_descriptor < 0 {
1721             return Err(io::Error::last_os_error());
1722         }
1723 
1724         // SAFETY: safe because we just opened this descriptor above.
1725         let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
1726 
1727         let st = stat(&f)?;
1728 
1729         // SAFETY: this doesn't modify any memory and there is no need to check the return
1730         // value because this system call always succeeds. We need to clear the umask here because
1731         // we want the client to be able to set all the bits in the mode.
1732         unsafe { libc::umask(0o000) };
1733 
1734         let mut inodes = self.inodes.lock();
1735 
1736         // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1737         inodes.insert(
1738             ROOT_ID,
1739             InodeAltKey {
1740                 ino: st.st_ino,
1741                 dev: st.st_dev,
1742             },
1743             Arc::new(InodeData {
1744                 inode: ROOT_ID,
1745                 file: Mutex::new((f, flags)),
1746                 refcount: AtomicU64::new(2),
1747                 filetype: st.st_mode.into(),
1748                 path: "".to_string(),
1749             }),
1750         );
1751 
1752         let mut opts = FsOptions::DO_READDIRPLUS
1753             | FsOptions::READDIRPLUS_AUTO
1754             | FsOptions::EXPORT_SUPPORT
1755             | FsOptions::DONT_MASK
1756             | FsOptions::CACHE_SYMLINKS
1757             | FsOptions::SECURITY_CONTEXT;
1758         if self.cfg.posix_acl {
1759             opts |= FsOptions::POSIX_ACL;
1760         }
1761         if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
1762             opts |= FsOptions::WRITEBACK_CACHE;
1763             self.writeback.store(true, Ordering::Relaxed);
1764         }
1765         if self.cfg.cache_policy == CachePolicy::Always {
1766             if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
1767                 opts |= FsOptions::ZERO_MESSAGE_OPEN;
1768                 self.zero_message_open.store(true, Ordering::Relaxed);
1769             }
1770             if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
1771                 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
1772                 self.zero_message_opendir.store(true, Ordering::Relaxed);
1773             }
1774         }
1775         Ok(opts)
1776     }
1777 
destroy(&self)1778     fn destroy(&self) {
1779         cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
1780         self.handles.lock().clear();
1781         self.inodes.lock().clear();
1782     }
1783 
statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64>1784     fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
1785         let _trace = fs_trace!(self.tag, "statfs", inode);
1786         let data = self.find_inode(inode)?;
1787 
1788         let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1789 
1790         // SAFETY: this will only modify `out` and we check the return value.
1791         syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
1792 
1793         // SAFETY: the kernel guarantees that `out` has been initialized.
1794         Ok(unsafe { out.assume_init() })
1795     }
1796 
lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry>1797     fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
1798         let data = self.find_inode(parent)?;
1799         #[allow(unused_variables)]
1800         let path = format!(
1801             "{}/{}",
1802             data.path,
1803             name.to_str().unwrap_or("<non UTF-8 path>")
1804         );
1805         let _trace = fs_trace!(self.tag, "lookup", parent, path);
1806 
1807         let mut res = self.do_lookup_with_casefold_fallback(&data, name);
1808 
1809         // FUSE takes a inode=0 as a request to do negative dentry cache.
1810         // So, if `negative_timeout` is set, return success with the timeout value and inode=0 as a
1811         // response.
1812         if let Err(e) = &res {
1813             if e.kind() == std::io::ErrorKind::NotFound && !self.cfg.negative_timeout.is_zero() {
1814                 res = Ok(Entry::new_negative(self.cfg.negative_timeout));
1815             }
1816         }
1817 
1818         res
1819     }
1820 
forget(&self, _ctx: Context, inode: Inode, count: u64)1821     fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
1822         let _trace = fs_trace!(self.tag, "forget", inode, count);
1823         let mut inodes = self.inodes.lock();
1824         let caches = self.lock_casefold_lookup_caches();
1825         if forget_one(&mut inodes, inode, count) {
1826             if let Some(mut c) = caches {
1827                 c.forget(inode);
1828             }
1829         }
1830     }
1831 
batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>)1832     fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
1833         let mut inodes = self.inodes.lock();
1834         let mut caches = self.lock_casefold_lookup_caches();
1835         for (inode, count) in requests {
1836             if forget_one(&mut inodes, inode, count) {
1837                 if let Some(c) = caches.as_mut() {
1838                     c.forget(inode);
1839                 }
1840             }
1841         }
1842     }
1843 
opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1844     fn opendir(
1845         &self,
1846         _ctx: Context,
1847         inode: Inode,
1848         flags: u32,
1849     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1850         let _trace = fs_trace!(self.tag, "opendir", inode, flags);
1851         if self.zero_message_opendir.load(Ordering::Relaxed) {
1852             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1853         } else {
1854             self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
1855         }
1856     }
1857 
releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()>1858     fn releasedir(
1859         &self,
1860         _ctx: Context,
1861         inode: Inode,
1862         _flags: u32,
1863         handle: Handle,
1864     ) -> io::Result<()> {
1865         let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
1866         if self.zero_message_opendir.load(Ordering::Relaxed) {
1867             Ok(())
1868         } else {
1869             self.do_release(inode, handle)
1870         }
1871     }
1872 
mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<Entry>1873     fn mkdir(
1874         &self,
1875         ctx: Context,
1876         parent: Inode,
1877         name: &CStr,
1878         mode: u32,
1879         umask: u32,
1880         security_ctx: Option<&CStr>,
1881     ) -> io::Result<Entry> {
1882         let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
1883         let data = self.find_inode(parent)?;
1884 
1885         let _ctx = security_ctx
1886             .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
1887             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
1888             .transpose()?;
1889 
1890         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1891         {
1892             let casefold_cache = self.lock_casefold_lookup_caches();
1893             let _scoped_umask = ScopedUmask::new(umask);
1894 
1895             // SAFETY: this doesn't modify any memory and we check the return value.
1896             syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
1897             if let Some(mut c) = casefold_cache {
1898                 c.insert(data.inode, name);
1899             }
1900         }
1901         self.do_lookup(&data, name)
1902     }
1903 
rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1904     fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1905         let _trace = fs_trace!(self.tag, "rmdir", parent, name);
1906         let data = self.find_inode(parent)?;
1907         let casefold_cache = self.lock_casefold_lookup_caches();
1908         // TODO(b/278691962): If ascii_casefold is enabled, we need to call
1909         // `get_case_unfolded_name()` to get the actual name to be unlinked.
1910         self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
1911         if let Some(mut c) = casefold_cache {
1912             c.remove(data.inode, name);
1913         }
1914         Ok(())
1915     }
1916 
readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result<Self::DirIter>1917     fn readdir(
1918         &self,
1919         _ctx: Context,
1920         inode: Inode,
1921         handle: Handle,
1922         size: u32,
1923         offset: u64,
1924     ) -> io::Result<Self::DirIter> {
1925         let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
1926         let buf = vec![0; size as usize].into_boxed_slice();
1927 
1928         if self.zero_message_opendir.load(Ordering::Relaxed) {
1929             let data = self.find_inode(inode)?;
1930             ReadDir::new(&*data, offset as libc::off64_t, buf)
1931         } else {
1932             let data = self.find_handle(handle, inode)?;
1933 
1934             let dir = data.file.lock();
1935 
1936             ReadDir::new(&*dir, offset as libc::off64_t, buf)
1937         }
1938     }
1939 
open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1940     fn open(
1941         &self,
1942         _ctx: Context,
1943         inode: Inode,
1944         flags: u32,
1945     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1946         if self.zero_message_open.load(Ordering::Relaxed) {
1947             let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
1948             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1949         } else {
1950             let _trace = fs_trace!(self.tag, "open", inode, flags);
1951             self.do_open(inode, flags)
1952         }
1953     }
1954 
release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option<u64>, ) -> io::Result<()>1955     fn release(
1956         &self,
1957         _ctx: Context,
1958         inode: Inode,
1959         _flags: u32,
1960         handle: Handle,
1961         _flush: bool,
1962         _flock_release: bool,
1963         _lock_owner: Option<u64>,
1964     ) -> io::Result<()> {
1965         if self.zero_message_open.load(Ordering::Relaxed) {
1966             let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
1967             Ok(())
1968         } else {
1969             let _trace = fs_trace!(self.tag, "release", inode, handle);
1970             self.do_release(inode, handle)
1971         }
1972     }
1973 
chromeos_tmpfile( &self, ctx: Context, parent: Self::Inode, mode: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<Entry>1974     fn chromeos_tmpfile(
1975         &self,
1976         ctx: Context,
1977         parent: Self::Inode,
1978         mode: u32,
1979         umask: u32,
1980         security_ctx: Option<&CStr>,
1981     ) -> io::Result<Entry> {
1982         let _trace = fs_trace!(
1983             self.tag,
1984             "chromeos_tempfile",
1985             parent,
1986             mode,
1987             umask,
1988             security_ctx
1989         );
1990         let data = self.find_inode(parent)?;
1991 
1992         let _ctx = security_ctx
1993             .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
1994             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
1995             .transpose()?;
1996 
1997         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1998 
1999         let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
2000 
2001         // SAFETY: This string is nul-terminated and does not contain any interior nul bytes
2002         let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
2003 
2004         let fd = {
2005             let _scoped_umask = ScopedUmask::new(umask);
2006 
2007             // SAFETY: this doesn't modify any memory and we check the return value.
2008             syscall!(unsafe {
2009                 libc::openat64(
2010                     data.as_raw_descriptor(),
2011                     current_dir.as_ptr(),
2012                     tmpflags,
2013                     mode,
2014                 )
2015             })?
2016         };
2017         // No need to add casefold_cache becuase we created an anonymous file.
2018 
2019         // SAFETY: safe because we just opened this fd.
2020         let tmpfile = unsafe { File::from_raw_descriptor(fd) };
2021 
2022         let st = stat(&tmpfile)?;
2023         let path = format!(
2024             "{}/{}",
2025             data.path.clone(),
2026             current_dir.to_str().unwrap_or("<non UTF-8 str>")
2027         );
2028         Ok(self.add_entry(tmpfile, st, tmpflags, path))
2029     }
2030 
create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>2031     fn create(
2032         &self,
2033         ctx: Context,
2034         parent: Inode,
2035         name: &CStr,
2036         mode: u32,
2037         flags: u32,
2038         umask: u32,
2039         security_ctx: Option<&CStr>,
2040     ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
2041         let _trace = fs_trace!(
2042             self.tag,
2043             "create",
2044             parent,
2045             name,
2046             mode,
2047             flags,
2048             umask,
2049             security_ctx
2050         );
2051         let data = self.find_inode(parent)?;
2052 
2053         let _ctx = security_ctx
2054             .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
2055             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2056             .transpose()?;
2057 
2058         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2059 
2060         let create_flags =
2061             (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
2062 
2063         let fd = {
2064             let _scoped_umask = ScopedUmask::new(umask);
2065             let casefold_cache = self.lock_casefold_lookup_caches();
2066 
2067             // SAFETY: this doesn't modify any memory and we check the return value. We don't really
2068             // check `flags` because if the kernel can't handle poorly specified flags then we have
2069             // much bigger problems.
2070             // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2071             // `get_case_unfolded_name()` to get the actual name to be created.
2072             let fd = syscall!(unsafe {
2073                 libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
2074             })?;
2075             if let Some(mut c) = casefold_cache {
2076                 c.insert(parent, name);
2077             }
2078             fd
2079         };
2080 
2081         // SAFETY: safe because we just opened this fd.
2082         let file = unsafe { File::from_raw_descriptor(fd) };
2083 
2084         let st = stat(&file)?;
2085         let path = format!(
2086             "{}/{}",
2087             data.path.clone(),
2088             name.to_str().unwrap_or("<non UTF-8 str>")
2089         );
2090         let entry = self.add_entry(file, st, create_flags, path);
2091 
2092         let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
2093             (None, OpenOptions::KEEP_CACHE)
2094         } else {
2095             self.do_open_at(
2096                 data,
2097                 name,
2098                 entry.inode,
2099                 flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
2100             )
2101             .map_err(|e| {
2102                 // Don't leak the entry.
2103                 self.forget(ctx, entry.inode, 1);
2104                 e
2105             })?
2106         };
2107         Ok((entry, handle, opts))
2108     }
2109 
unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>2110     fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2111         let _trace = fs_trace!(self.tag, "unlink", parent, name);
2112         let data = self.find_inode(parent)?;
2113         let casefold_cache = self.lock_casefold_lookup_caches();
2114         // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2115         // `get_case_unfolded_name()` to get the actual name to be unlinked.
2116         self.do_unlink(&data, name, 0)?;
2117         if let Some(mut c) = casefold_cache {
2118             c.remove(data.inode, name);
2119         }
2120         Ok(())
2121     }
2122 
read<W: io::Write + ZeroCopyWriter>( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option<u64>, _flags: u32, ) -> io::Result<usize>2123     fn read<W: io::Write + ZeroCopyWriter>(
2124         &self,
2125         _ctx: Context,
2126         inode: Inode,
2127         handle: Handle,
2128         mut w: W,
2129         size: u32,
2130         offset: u64,
2131         _lock_owner: Option<u64>,
2132         _flags: u32,
2133     ) -> io::Result<usize> {
2134         if self.zero_message_open.load(Ordering::Relaxed) {
2135             let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
2136             let data = self.find_inode(inode)?;
2137 
2138             let mut file = data.file.lock();
2139             let mut flags = file.1;
2140             match flags & libc::O_ACCMODE {
2141                 libc::O_WRONLY => {
2142                     flags &= !libc::O_WRONLY;
2143                     flags |= libc::O_RDWR;
2144 
2145                     // We need to get a readable handle for this file.
2146                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2147                     *file = (newfile, flags);
2148                 }
2149                 libc::O_RDONLY | libc::O_RDWR => {}
2150                 _ => panic!("Unexpected flags: {:#x}", flags),
2151             }
2152 
2153             w.write_from(&mut file.0, size as usize, offset)
2154         } else {
2155             let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
2156             let data = self.find_handle(handle, inode)?;
2157 
2158             let mut f = data.file.lock();
2159             w.write_from(&mut f, size as usize, offset)
2160         }
2161     }
2162 
write<R: io::Read + ZeroCopyReader>( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option<u64>, _delayed_write: bool, flags: u32, ) -> io::Result<usize>2163     fn write<R: io::Read + ZeroCopyReader>(
2164         &self,
2165         _ctx: Context,
2166         inode: Inode,
2167         handle: Handle,
2168         mut r: R,
2169         size: u32,
2170         offset: u64,
2171         _lock_owner: Option<u64>,
2172         _delayed_write: bool,
2173         flags: u32,
2174     ) -> io::Result<usize> {
2175         // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
2176         // automatically clear the setuid and setgid bits for us.
2177         let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
2178             Some(drop_cap_fsetid()?)
2179         } else {
2180             None
2181         };
2182 
2183         if self.zero_message_open.load(Ordering::Relaxed) {
2184             let _trace = fs_trace!(
2185                 self.tag,
2186                 "write (zero-message)",
2187                 inode,
2188                 handle,
2189                 size,
2190                 offset
2191             );
2192 
2193             let data = self.find_inode(inode)?;
2194 
2195             let mut file = data.file.lock();
2196             let mut flags = file.1;
2197             match flags & libc::O_ACCMODE {
2198                 libc::O_RDONLY => {
2199                     flags &= !libc::O_RDONLY;
2200                     flags |= libc::O_RDWR;
2201 
2202                     // We need to get a writable handle for this file.
2203                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2204                     *file = (newfile, flags);
2205                 }
2206                 libc::O_WRONLY | libc::O_RDWR => {}
2207                 _ => panic!("Unexpected flags: {:#x}", flags),
2208             }
2209 
2210             r.read_to(&mut file.0, size as usize, offset)
2211         } else {
2212             let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
2213 
2214             let data = self.find_handle(handle, inode)?;
2215 
2216             let mut f = data.file.lock();
2217             r.read_to(&mut f, size as usize, offset)
2218         }
2219     }
2220 
getattr( &self, _ctx: Context, inode: Inode, _handle: Option<Handle>, ) -> io::Result<(libc::stat64, Duration)>2221     fn getattr(
2222         &self,
2223         _ctx: Context,
2224         inode: Inode,
2225         _handle: Option<Handle>,
2226     ) -> io::Result<(libc::stat64, Duration)> {
2227         let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
2228 
2229         let data = self.find_inode(inode)?;
2230         self.do_getattr(&data)
2231     }
2232 
setattr( &self, _ctx: Context, inode: Inode, attr: libc::stat64, handle: Option<Handle>, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)>2233     fn setattr(
2234         &self,
2235         _ctx: Context,
2236         inode: Inode,
2237         attr: libc::stat64,
2238         handle: Option<Handle>,
2239         valid: SetattrValid,
2240     ) -> io::Result<(libc::stat64, Duration)> {
2241         let _trace = fs_trace!(self.tag, "setattr", inode, handle);
2242         let inode_data = self.find_inode(inode)?;
2243 
2244         enum Data {
2245             Handle(Arc<HandleData>, RawDescriptor),
2246             ProcPath(CString),
2247         }
2248 
2249         // If we have a handle then use it otherwise get a new fd from the inode.
2250         let data = if let Some(handle) = handle.filter(|&h| h != 0) {
2251             let hd = self.find_handle(handle, inode)?;
2252 
2253             let fd = hd.file.lock().as_raw_descriptor();
2254             Data::Handle(hd, fd)
2255         } else {
2256             let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
2257                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2258             Data::ProcPath(pathname)
2259         };
2260 
2261         if valid.contains(SetattrValid::MODE) {
2262             // SAFETY: this doesn't modify any memory and we check the return value.
2263             syscall!(unsafe {
2264                 match data {
2265                     Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
2266                     Data::ProcPath(ref p) => {
2267                         libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
2268                     }
2269                 }
2270             })?;
2271         }
2272 
2273         if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
2274             let uid = if valid.contains(SetattrValid::UID) {
2275                 attr.st_uid
2276             } else {
2277                 // Cannot use -1 here because these are unsigned values.
2278                 ::std::u32::MAX
2279             };
2280             let gid = if valid.contains(SetattrValid::GID) {
2281                 attr.st_gid
2282             } else {
2283                 // Cannot use -1 here because these are unsigned values.
2284                 ::std::u32::MAX
2285             };
2286 
2287             // SAFETY: this is a constant value that is a nul-terminated string without interior
2288             // nul bytes.
2289             let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
2290 
2291             // SAFETY: this doesn't modify any memory and we check the return value.
2292             syscall!(unsafe {
2293                 libc::fchownat(
2294                     inode_data.as_raw_descriptor(),
2295                     empty.as_ptr(),
2296                     uid,
2297                     gid,
2298                     libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
2299                 )
2300             })?;
2301         }
2302 
2303         if valid.contains(SetattrValid::SIZE) {
2304             syscall!(match data {
2305                 Data::Handle(_, fd) => {
2306                     // SAFETY: this doesn't modify any memory and we check the return value.
2307                     unsafe { libc::ftruncate64(fd, attr.st_size) }
2308                 }
2309                 _ => {
2310                     // There is no `ftruncateat` so we need to get a new fd and truncate it.
2311                     let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
2312                     // SAFETY: this doesn't modify any memory and we check the return value.
2313                     unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
2314                 }
2315             })?;
2316         }
2317 
2318         if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
2319             let mut tvs = [
2320                 libc::timespec {
2321                     tv_sec: 0,
2322                     tv_nsec: libc::UTIME_OMIT,
2323                 },
2324                 libc::timespec {
2325                     tv_sec: 0,
2326                     tv_nsec: libc::UTIME_OMIT,
2327                 },
2328             ];
2329 
2330             if valid.contains(SetattrValid::ATIME_NOW) {
2331                 tvs[0].tv_nsec = libc::UTIME_NOW;
2332             } else if valid.contains(SetattrValid::ATIME) {
2333                 tvs[0].tv_sec = attr.st_atime;
2334                 tvs[0].tv_nsec = attr.st_atime_nsec;
2335             }
2336 
2337             if valid.contains(SetattrValid::MTIME_NOW) {
2338                 tvs[1].tv_nsec = libc::UTIME_NOW;
2339             } else if valid.contains(SetattrValid::MTIME) {
2340                 tvs[1].tv_sec = attr.st_mtime;
2341                 tvs[1].tv_nsec = attr.st_mtime_nsec;
2342             }
2343 
2344             // SAFETY: this doesn't modify any memory and we check the return value.
2345             syscall!(unsafe {
2346                 match data {
2347                     Data::Handle(_, fd) => libc::futimens(fd, tvs.as_ptr()),
2348                     Data::ProcPath(ref p) => {
2349                         libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
2350                     }
2351                 }
2352             })?;
2353         }
2354 
2355         self.do_getattr(&inode_data)
2356     }
2357 
rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()>2358     fn rename(
2359         &self,
2360         _ctx: Context,
2361         olddir: Inode,
2362         oldname: &CStr,
2363         newdir: Inode,
2364         newname: &CStr,
2365         flags: u32,
2366     ) -> io::Result<()> {
2367         let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
2368 
2369         let old_inode = self.find_inode(olddir)?;
2370         let new_inode = self.find_inode(newdir)?;
2371         {
2372             let casefold_cache = self.lock_casefold_lookup_caches();
2373 
2374             // SAFETY: this doesn't modify any memory and we check the return value.
2375             // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
2376             // and we have glibc 2.28.
2377             syscall!(unsafe {
2378                 libc::syscall(
2379                     libc::SYS_renameat2,
2380                     old_inode.as_raw_descriptor(),
2381                     oldname.as_ptr(),
2382                     new_inode.as_raw_descriptor(),
2383                     newname.as_ptr(),
2384                     flags,
2385                 )
2386             })?;
2387             if let Some(mut c) = casefold_cache {
2388                 c.remove(olddir, oldname);
2389                 c.insert(newdir, newname);
2390             }
2391         }
2392 
2393         Ok(())
2394     }
2395 
mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<Entry>2396     fn mknod(
2397         &self,
2398         ctx: Context,
2399         parent: Inode,
2400         name: &CStr,
2401         mode: u32,
2402         rdev: u32,
2403         umask: u32,
2404         security_ctx: Option<&CStr>,
2405     ) -> io::Result<Entry> {
2406         let _trace = fs_trace!(
2407             self.tag,
2408             "mknod",
2409             parent,
2410             name,
2411             mode,
2412             rdev,
2413             umask,
2414             security_ctx
2415         );
2416         let data = self.find_inode(parent)?;
2417 
2418         let _ctx = security_ctx
2419             .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
2420             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2421             .transpose()?;
2422 
2423         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2424         {
2425             let _scoped_umask = ScopedUmask::new(umask);
2426             let casefold_cache = self.lock_casefold_lookup_caches();
2427 
2428             // SAFETY: this doesn't modify any memory and we check the return value.
2429             syscall!(unsafe {
2430                 libc::mknodat(
2431                     data.as_raw_descriptor(),
2432                     name.as_ptr(),
2433                     mode as libc::mode_t,
2434                     rdev as libc::dev_t,
2435                 )
2436             })?;
2437             if let Some(mut c) = casefold_cache {
2438                 c.insert(parent, name);
2439             }
2440         }
2441 
2442         self.do_lookup(&data, name)
2443     }
2444 
link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result<Entry>2445     fn link(
2446         &self,
2447         _ctx: Context,
2448         inode: Inode,
2449         newparent: Inode,
2450         newname: &CStr,
2451     ) -> io::Result<Entry> {
2452         let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
2453         let data = self.find_inode(inode)?;
2454         let new_inode = self.find_inode(newparent)?;
2455 
2456         let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2457             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2458 
2459         {
2460             let casefold_cache = self.lock_casefold_lookup_caches();
2461             // SAFETY: this doesn't modify any memory and we check the return value.
2462             syscall!(unsafe {
2463                 libc::linkat(
2464                     self.proc.as_raw_descriptor(),
2465                     path.as_ptr(),
2466                     new_inode.as_raw_descriptor(),
2467                     newname.as_ptr(),
2468                     libc::AT_SYMLINK_FOLLOW,
2469                 )
2470             })?;
2471             if let Some(mut c) = casefold_cache {
2472                 c.insert(newparent, newname);
2473             }
2474         }
2475 
2476         self.do_lookup(&new_inode, newname)
2477     }
2478 
symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, security_ctx: Option<&CStr>, ) -> io::Result<Entry>2479     fn symlink(
2480         &self,
2481         ctx: Context,
2482         linkname: &CStr,
2483         parent: Inode,
2484         name: &CStr,
2485         security_ctx: Option<&CStr>,
2486     ) -> io::Result<Entry> {
2487         let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
2488         let data = self.find_inode(parent)?;
2489 
2490         let _ctx = security_ctx
2491             .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
2492             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2493             .transpose()?;
2494 
2495         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2496         {
2497             let casefold_cache = self.lock_casefold_lookup_caches();
2498             // SAFETY: this doesn't modify any memory and we check the return value.
2499             syscall!(unsafe {
2500                 libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
2501             })?;
2502             if let Some(mut c) = casefold_cache {
2503                 c.insert(parent, name);
2504             }
2505         }
2506 
2507         self.do_lookup(&data, name)
2508     }
2509 
readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>>2510     fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
2511         let _trace = fs_trace!(self.tag, "readlink", inode);
2512         let data = self.find_inode(inode)?;
2513 
2514         let mut buf = vec![0; libc::PATH_MAX as usize];
2515 
2516         // SAFETY: this is a constant value that is a nul-terminated string without interior nul
2517         // bytes.
2518         let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
2519 
2520         // SAFETY: this will only modify the contents of `buf` and we check the return value.
2521         let res = syscall!(unsafe {
2522             libc::readlinkat(
2523                 data.as_raw_descriptor(),
2524                 empty.as_ptr(),
2525                 buf.as_mut_ptr() as *mut libc::c_char,
2526                 buf.len(),
2527             )
2528         })?;
2529 
2530         buf.resize(res as usize, 0);
2531         Ok(buf)
2532     }
2533 
flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()>2534     fn flush(
2535         &self,
2536         _ctx: Context,
2537         inode: Inode,
2538         handle: Handle,
2539         _lock_owner: u64,
2540     ) -> io::Result<()> {
2541         let _trace = fs_trace!(self.tag, "flush", inode, handle);
2542         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2543             self.find_inode(inode)?
2544         } else {
2545             self.find_handle(handle, inode)?
2546         };
2547 
2548         // SAFETY:
2549         // Since this method is called whenever an fd is closed in the client, we can emulate that
2550         // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
2551         // because this doesn't modify any memory and we check the return values.
2552         unsafe {
2553             let newfd = syscall!(libc::fcntl(
2554                 data.as_raw_descriptor(),
2555                 libc::F_DUPFD_CLOEXEC,
2556                 0
2557             ))?;
2558 
2559             syscall!(libc::close(newfd))?;
2560         }
2561         Ok(())
2562     }
2563 
fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()>2564     fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
2565         if self.zero_message_open.load(Ordering::Relaxed) {
2566             let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
2567             let data = self.find_inode(inode)?;
2568             self.do_fsync(&*data, datasync)
2569         } else {
2570             let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
2571             let data = self.find_handle(handle, inode)?;
2572 
2573             let file = data.file.lock();
2574             self.do_fsync(&*file, datasync)
2575         }
2576     }
2577 
fsyncdir( &self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()>2578     fn fsyncdir(
2579         &self,
2580         _ctx: Context,
2581         inode: Inode,
2582         datasync: bool,
2583         handle: Handle,
2584     ) -> io::Result<()> {
2585         if self.zero_message_opendir.load(Ordering::Relaxed) {
2586             let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
2587             let data = self.find_inode(inode)?;
2588             self.do_fsync(&*data, datasync)
2589         } else {
2590             let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
2591             let data = self.find_handle(handle, inode)?;
2592 
2593             let file = data.file.lock();
2594             self.do_fsync(&*file, datasync)
2595         }
2596     }
2597 
access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()>2598     fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
2599         let _trace = fs_trace!(self.tag, "access", inode, mask);
2600         let data = self.find_inode(inode)?;
2601 
2602         let st = stat(&*data)?;
2603         let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
2604 
2605         if mode == libc::F_OK {
2606             // The file exists since we were able to call `stat(2)` on it.
2607             return Ok(());
2608         }
2609 
2610         if (mode & libc::R_OK) != 0 {
2611             if ctx.uid != 0
2612                 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
2613                 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
2614                 && st.st_mode & 0o004 == 0
2615             {
2616                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2617             }
2618         }
2619 
2620         if (mode & libc::W_OK) != 0 {
2621             if ctx.uid != 0
2622                 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
2623                 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
2624                 && st.st_mode & 0o002 == 0
2625             {
2626                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2627             }
2628         }
2629 
2630         // root can only execute something if it is executable by one of the owner, the group, or
2631         // everyone.
2632         if (mode & libc::X_OK) != 0 {
2633             if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
2634                 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
2635                 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
2636                 && st.st_mode & 0o001 == 0
2637             {
2638                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2639             }
2640         }
2641 
2642         Ok(())
2643     }
2644 
setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()>2645     fn setxattr(
2646         &self,
2647         _ctx: Context,
2648         inode: Inode,
2649         name: &CStr,
2650         value: &[u8],
2651         flags: u32,
2652     ) -> io::Result<()> {
2653         let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
2654         // We can't allow the VM to set this xattr because an unprivileged process may use it to set
2655         // a privileged xattr.
2656         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2657             return Err(io::Error::from_raw_os_error(libc::EPERM));
2658         }
2659 
2660         let data = self.find_inode(inode)?;
2661         let name = self.rewrite_xattr_name(name);
2662         let file = data.file.lock();
2663         let o_path_file = (file.1 & libc::O_PATH) != 0;
2664         if o_path_file {
2665             // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
2666             // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
2667             // setting the CWD back to the root directory.
2668             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2669                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2670 
2671             syscall!(self.with_proc_chdir(|| {
2672                 // SAFETY: this doesn't modify any memory and we check the return value.
2673                 unsafe {
2674                     libc::setxattr(
2675                         path.as_ptr(),
2676                         name.as_ptr(),
2677                         value.as_ptr() as *const libc::c_void,
2678                         value.len() as libc::size_t,
2679                         flags as c_int,
2680                     )
2681                 }
2682             }))?;
2683         } else {
2684             syscall!(
2685                 // For regular files and directories, we can just use fsetxattr.
2686                 // SAFETY: this doesn't modify any memory and we check the return value.
2687                 unsafe {
2688                     libc::fsetxattr(
2689                         file.0.as_raw_descriptor(),
2690                         name.as_ptr(),
2691                         value.as_ptr() as *const libc::c_void,
2692                         value.len() as libc::size_t,
2693                         flags as c_int,
2694                     )
2695                 }
2696             )?;
2697         }
2698 
2699         Ok(())
2700     }
2701 
getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result<GetxattrReply>2702     fn getxattr(
2703         &self,
2704         _ctx: Context,
2705         inode: Inode,
2706         name: &CStr,
2707         size: u32,
2708     ) -> io::Result<GetxattrReply> {
2709         let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
2710         // We don't allow the VM to set this xattr so we also pretend there is no value associated
2711         // with it.
2712         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2713             return Err(io::Error::from_raw_os_error(libc::ENODATA));
2714         }
2715 
2716         let data = self.find_inode(inode)?;
2717         let name = self.rewrite_xattr_name(name);
2718         let mut buf = vec![0u8; size as usize];
2719 
2720         // SAFETY: this will only modify the contents of `buf`.
2721         let res = self.do_getxattr(&data, &name, &mut buf[..])?;
2722         if size == 0 {
2723             Ok(GetxattrReply::Count(res as u32))
2724         } else {
2725             buf.truncate(res);
2726             Ok(GetxattrReply::Value(buf))
2727         }
2728     }
2729 
listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply>2730     fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
2731         let _trace = fs_trace!(self.tag, "listxattr", inode, size);
2732         let data = self.find_inode(inode)?;
2733 
2734         let mut buf = vec![0u8; size as usize];
2735 
2736         let file = data.file.lock();
2737         let o_path_file = (file.1 & libc::O_PATH) != 0;
2738         let res = if o_path_file {
2739             // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
2740             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2741             // and then setting the CWD back to the root directory.
2742             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2743                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2744 
2745             // SAFETY: this will only modify `buf` and we check the return value.
2746             syscall!(self.with_proc_chdir(|| unsafe {
2747                 libc::listxattr(
2748                     path.as_ptr(),
2749                     buf.as_mut_ptr() as *mut libc::c_char,
2750                     buf.len() as libc::size_t,
2751                 )
2752             }))?
2753         } else {
2754             // For regular files and directories, we can just flistxattr.
2755             // SAFETY: this will only write to `buf` and we check the return value.
2756             syscall!(unsafe {
2757                 libc::flistxattr(
2758                     file.0.as_raw_descriptor(),
2759                     buf.as_mut_ptr() as *mut libc::c_char,
2760                     buf.len() as libc::size_t,
2761                 )
2762             })?
2763         };
2764 
2765         if size == 0 {
2766             Ok(ListxattrReply::Count(res as u32))
2767         } else {
2768             buf.truncate(res as usize);
2769 
2770             if self.cfg.rewrite_security_xattrs {
2771                 strip_xattr_prefix(&mut buf);
2772             }
2773             Ok(ListxattrReply::Names(buf))
2774         }
2775     }
2776 
removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()>2777     fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
2778         let _trace = fs_trace!(self.tag, "removexattr", inode, name);
2779         // We don't allow the VM to set this xattr so we also pretend there is no value associated
2780         // with it.
2781         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2782             return Err(io::Error::from_raw_os_error(libc::ENODATA));
2783         }
2784 
2785         let data = self.find_inode(inode)?;
2786         let name = self.rewrite_xattr_name(name);
2787 
2788         let file = data.file.lock();
2789         let o_path_file = (file.1 & libc::O_PATH) != 0;
2790         if o_path_file {
2791             // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
2792             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2793             // and then setting the CWD back to the root directory.
2794             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2795                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2796 
2797             syscall!(self.with_proc_chdir(||
2798                     // SAFETY: this doesn't modify any memory and we check the return value.
2799                     unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
2800         } else {
2801             // For regular files and directories, we can just use fremovexattr.
2802             syscall!(
2803                 // SAFETY: this doesn't modify any memory and we check the return value.
2804                 unsafe { libc::fremovexattr(file.0.as_raw_descriptor(), name.as_ptr()) }
2805             )?;
2806         }
2807 
2808         Ok(())
2809     }
2810 
fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()>2811     fn fallocate(
2812         &self,
2813         _ctx: Context,
2814         inode: Inode,
2815         handle: Handle,
2816         mode: u32,
2817         offset: u64,
2818         length: u64,
2819     ) -> io::Result<()> {
2820         let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
2821 
2822         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2823             let data = self.find_inode(inode)?;
2824 
2825             {
2826                 // fallocate needs a writable fd
2827                 let mut file = data.file.lock();
2828                 let mut flags = file.1;
2829                 match flags & libc::O_ACCMODE {
2830                     libc::O_RDONLY => {
2831                         flags &= !libc::O_RDONLY;
2832                         flags |= libc::O_RDWR;
2833 
2834                         // We need to get a writable handle for this file.
2835                         let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2836                         *file = (newfile, flags);
2837                     }
2838                     libc::O_WRONLY | libc::O_RDWR => {}
2839                     _ => panic!("Unexpected flags: {:#x}", flags),
2840                 }
2841             }
2842 
2843             data
2844         } else {
2845             self.find_handle(handle, inode)?
2846         };
2847 
2848         let fd = data.as_raw_descriptor();
2849         // SAFETY: this doesn't modify any memory and we check the return value.
2850         syscall!(unsafe {
2851             libc::fallocate64(
2852                 fd,
2853                 mode as libc::c_int,
2854                 offset as libc::off64_t,
2855                 length as libc::off64_t,
2856             )
2857         })?;
2858 
2859         Ok(())
2860     }
2861 
2862     #[allow(clippy::unnecessary_cast)]
ioctl<R: io::Read>( &self, ctx: Context, inode: Inode, handle: Handle, _flags: IoctlFlags, cmd: u32, _arg: u64, in_size: u32, out_size: u32, r: R, ) -> io::Result<IoctlReply>2863     fn ioctl<R: io::Read>(
2864         &self,
2865         ctx: Context,
2866         inode: Inode,
2867         handle: Handle,
2868         _flags: IoctlFlags,
2869         cmd: u32,
2870         _arg: u64,
2871         in_size: u32,
2872         out_size: u32,
2873         r: R,
2874     ) -> io::Result<IoctlReply> {
2875         let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
2876 
2877         const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
2878         const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
2879         const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
2880         const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
2881         const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
2882         const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
2883         const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
2884         const ENABLE_VERITY: u32 = FS_IOC_ENABLE_VERITY() as u32;
2885         const MEASURE_VERITY: u32 = FS_IOC_MEASURE_VERITY() as u32;
2886 
2887         match cmd {
2888             GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
2889             GET_FSXATTR => {
2890                 if out_size < size_of::<fsxattr>() as u32 {
2891                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2892                 } else {
2893                     self.get_fsxattr(inode, handle)
2894                 }
2895             }
2896             SET_FSXATTR => {
2897                 if in_size < size_of::<fsxattr>() as u32 {
2898                     Err(io::Error::from_raw_os_error(libc::EINVAL))
2899                 } else {
2900                     self.set_fsxattr(ctx, inode, handle, r)
2901                 }
2902             }
2903             GET_FLAGS32 | GET_FLAGS64 => {
2904                 if out_size < size_of::<c_int>() as u32 {
2905                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2906                 } else {
2907                     self.get_flags(inode, handle)
2908                 }
2909             }
2910             SET_FLAGS32 | SET_FLAGS64 => {
2911                 if in_size < size_of::<c_int>() as u32 {
2912                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2913                 } else {
2914                     self.set_flags(ctx, inode, handle, r)
2915                 }
2916             }
2917             ENABLE_VERITY => {
2918                 if in_size < size_of::<fsverity_enable_arg>() as u32 {
2919                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2920                 } else {
2921                     self.enable_verity(inode, handle, r)
2922                 }
2923             }
2924             MEASURE_VERITY => {
2925                 if in_size < size_of::<fsverity_digest>() as u32
2926                     || out_size < size_of::<fsverity_digest>() as u32
2927                 {
2928                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2929                 } else {
2930                     self.measure_verity(inode, handle, r, out_size)
2931                 }
2932             }
2933             _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
2934         }
2935     }
2936 
copy_file_range( &self, ctx: Context, inode_src: Inode, handle_src: Handle, offset_src: u64, inode_dst: Inode, handle_dst: Handle, offset_dst: u64, length: u64, flags: u64, ) -> io::Result<usize>2937     fn copy_file_range(
2938         &self,
2939         ctx: Context,
2940         inode_src: Inode,
2941         handle_src: Handle,
2942         offset_src: u64,
2943         inode_dst: Inode,
2944         handle_dst: Handle,
2945         offset_dst: u64,
2946         length: u64,
2947         flags: u64,
2948     ) -> io::Result<usize> {
2949         let _trace = fs_trace!(
2950             self.tag,
2951             "copy_file_range",
2952             inode_src,
2953             handle_src,
2954             offset_src,
2955             inode_dst,
2956             handle_dst,
2957             offset_dst,
2958             length,
2959             flags
2960         );
2961         // We need to change credentials during a write so that the kernel will remove setuid or
2962         // setgid bits from the file if it was written to by someone other than the owner.
2963         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2964         let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
2965             if self.zero_message_open.load(Ordering::Relaxed) {
2966                 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
2967             } else {
2968                 (
2969                     self.find_handle(handle_src, inode_src)?,
2970                     self.find_handle(handle_dst, inode_dst)?,
2971                 )
2972             };
2973 
2974         let src = src_data.as_raw_descriptor();
2975         let dst = dst_data.as_raw_descriptor();
2976 
2977         Ok(syscall!(
2978             // SAFETY: this call is safe because it doesn't modify any memory and we
2979             // check the return value.
2980             unsafe {
2981                 libc::syscall(
2982                     libc::SYS_copy_file_range,
2983                     src,
2984                     &offset_src,
2985                     dst,
2986                     &offset_dst,
2987                     length,
2988                     flags,
2989                 )
2990             }
2991         )? as usize)
2992     }
2993 
set_up_mapping<M: Mapper>( &self, _ctx: Context, inode: Self::Inode, _handle: Self::Handle, file_offset: u64, mem_offset: u64, size: usize, prot: u32, mapper: M, ) -> io::Result<()>2994     fn set_up_mapping<M: Mapper>(
2995         &self,
2996         _ctx: Context,
2997         inode: Self::Inode,
2998         _handle: Self::Handle,
2999         file_offset: u64,
3000         mem_offset: u64,
3001         size: usize,
3002         prot: u32,
3003         mapper: M,
3004     ) -> io::Result<()> {
3005         let _trace = fs_trace!(
3006             self.tag,
3007             "set_up_mapping",
3008             inode,
3009             file_offset,
3010             mem_offset,
3011             size,
3012             prot
3013         );
3014         if !self.cfg.use_dax {
3015             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3016         }
3017 
3018         let read = prot & libc::PROT_READ as u32 != 0;
3019         let write = prot & libc::PROT_WRITE as u32 != 0;
3020         let (mmap_flags, prot) = match (read, write) {
3021             (true, true) => (libc::O_RDWR, Protection::read_write()),
3022             (true, false) => (libc::O_RDONLY, Protection::read()),
3023             // Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
3024             (false, true) => (libc::O_RDWR, Protection::write()),
3025             (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
3026         };
3027 
3028         let data = self.find_inode(inode)?;
3029 
3030         if self.zero_message_open.load(Ordering::Relaxed) {
3031             let mut file = data.file.lock();
3032             let mut open_flags = file.1;
3033             match (mmap_flags, open_flags & libc::O_ACCMODE) {
3034                 (libc::O_RDONLY, libc::O_WRONLY)
3035                 | (libc::O_RDWR, libc::O_RDONLY)
3036                 | (libc::O_RDWR, libc::O_WRONLY) => {
3037                     // We have a read-only or write-only fd and we need to upgrade it.
3038                     open_flags &= !libc::O_ACCMODE;
3039                     open_flags |= libc::O_RDWR;
3040 
3041                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
3042                     *file = (newfile, open_flags);
3043                 }
3044                 (libc::O_RDONLY, libc::O_RDONLY)
3045                 | (libc::O_RDONLY, libc::O_RDWR)
3046                 | (libc::O_RDWR, libc::O_RDWR) => {}
3047                 (m, o) => panic!(
3048                     "Unexpected combination of access flags: ({:#x}, {:#x})",
3049                     m, o
3050                 ),
3051             }
3052             mapper.map(mem_offset, size, &file.0, file_offset, prot)
3053         } else {
3054             let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
3055             mapper.map(mem_offset, size, &file, file_offset, prot)
3056         }
3057     }
3058 
remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()>3059     fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
3060         let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
3061         if !self.cfg.use_dax {
3062             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3063         }
3064 
3065         for RemoveMappingOne { moffset, len } in msgs {
3066             mapper.unmap(*moffset, *len)?;
3067         }
3068         Ok(())
3069     }
3070 
atomic_open( &self, ctx: Context, parent: Self::Inode, name: &CStr, mode: u32, flags: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)>3071     fn atomic_open(
3072         &self,
3073         ctx: Context,
3074         parent: Self::Inode,
3075         name: &CStr,
3076         mode: u32,
3077         flags: u32,
3078         umask: u32,
3079         security_ctx: Option<&CStr>,
3080     ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
3081         let _trace = fs_trace!(
3082             self.tag,
3083             "atomic_open",
3084             parent,
3085             name,
3086             mode,
3087             flags,
3088             umask,
3089             security_ctx
3090         );
3091         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
3092 
3093         // Perform lookup but not create negative dentry
3094         let data = self.find_inode(parent)?;
3095 
3096         // This lookup serves two purposes:
3097         // 1. If the O_CREATE flag is not set, it retrieves the d_entry for the file.
3098         // 2. If the O_CREATE flag is set, it checks whether the file exists.
3099         let res = self.do_lookup_with_casefold_fallback(&data, name);
3100 
3101         if let Err(e) = res {
3102             if e.kind() == std::io::ErrorKind::NotFound && (flags as i32 & libc::O_CREAT) != 0 {
3103                 // If the file did not exist & O_CREAT is set,
3104                 // create file & set FILE_CREATED bits in open options
3105                 let (entry, handler, mut opts) =
3106                     self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
3107                 opts |= OpenOptions::FILE_CREATED;
3108                 return Ok((entry, handler, opts));
3109             } else if e.kind() == std::io::ErrorKind::NotFound
3110                 && !self.cfg.negative_timeout.is_zero()
3111             {
3112                 return Ok((
3113                     Entry::new_negative(self.cfg.negative_timeout),
3114                     None,
3115                     OpenOptions::empty(),
3116                 ));
3117             }
3118             return Err(e);
3119         }
3120 
3121         // SAFETY: checked res is not error before
3122         let entry = res.unwrap();
3123 
3124         if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
3125             return Ok((entry, None, OpenOptions::empty()));
3126         }
3127 
3128         if (flags as i32 & (libc::O_CREAT | libc::O_EXCL)) == (libc::O_CREAT | libc::O_EXCL) {
3129             return Err(eexist());
3130         }
3131 
3132         let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
3133             (None, OpenOptions::KEEP_CACHE)
3134         } else {
3135             let (handler, opts) = self.do_open(entry.inode, flags)?;
3136             (handler, opts)
3137         };
3138         Ok((entry, handler, opts))
3139     }
3140 }
3141 
3142 #[cfg(test)]
3143 mod tests {
3144     use std::path::Path;
3145 
3146     use named_lock::NamedLock;
3147     use tempfile::TempDir;
3148 
3149     use super::*;
3150 
3151     const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
3152 
3153     // Create an instance of `Context` with valid uid, gid, and pid.
3154     // The correct ids are necessary for test cases where new files are created.
get_context() -> Context3155     fn get_context() -> Context {
3156         // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3157         // guarantees that they can never fail.
3158         let uid = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
3159         // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3160         // guarantees that they can never fail.
3161         let gid = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
3162         let pid = std::process::id() as libc::pid_t;
3163         Context { uid, gid, pid }
3164     }
3165 
3166     /// Creates the given directories and files under `temp_dir`.
create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str])3167     fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
3168         let path = temp_dir.path();
3169 
3170         for d in dirs {
3171             std::fs::create_dir_all(path.join(d)).unwrap();
3172         }
3173 
3174         for f in files {
3175             File::create(path.join(f)).unwrap();
3176         }
3177     }
3178 
3179     /// Looks up the given `path` in `fs`.
lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode>3180     fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
3181         let mut inode = 1;
3182         let ctx = get_context();
3183         for name in path.iter() {
3184             let name = CString::new(name.to_str().unwrap()).unwrap();
3185             let ent = match fs.lookup(ctx, inode, &name) {
3186                 Ok(ent) => ent,
3187                 Err(e) => {
3188                     return Err(e);
3189                 }
3190             };
3191             inode = ent.inode;
3192         }
3193         Ok(inode)
3194     }
3195 
3196     /// Creates a file at the given `path`.
create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry>3197     fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3198         let parent = path.parent().unwrap();
3199         let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3200         let parent_inode = lookup(fs, parent)?;
3201         let ctx = get_context();
3202         let security_ctx = None;
3203         fs.create(
3204             ctx,
3205             parent_inode,
3206             &filename,
3207             0o666,
3208             libc::O_RDWR as u32,
3209             0,
3210             security_ctx,
3211         )
3212         .map(|(entry, _, _)| entry)
3213     }
3214 
3215     /// Removes a file at the given `path`.
unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()>3216     fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3217         let parent = path.parent().unwrap();
3218         let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3219         let parent_inode = lookup(fs, parent)?;
3220         let ctx = get_context();
3221         fs.unlink(ctx, parent_inode, &filename)
3222     }
3223 
3224     /// Forgets cache.
forget(fs: &PassthroughFs, path: &Path) -> io::Result<()>3225     fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3226         let ctx = get_context();
3227         let inode = lookup(fs, path)?;
3228         // Pass `u64::MAX` to ensure that the refcount goes to 0 and we forget inode.
3229         fs.forget(ctx, inode, u64::MAX);
3230         Ok(())
3231     }
3232 
3233     /// Looks up and open the given `path` in `fs`.
atomic_open( fs: &PassthroughFs, path: &Path, mode: u32, flags: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>3234     fn atomic_open(
3235         fs: &PassthroughFs,
3236         path: &Path,
3237         mode: u32,
3238         flags: u32,
3239         umask: u32,
3240         security_ctx: Option<&CStr>,
3241     ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
3242         let mut inode = 1;
3243         let ctx = get_context();
3244 
3245         let path_vec: Vec<_> = path.iter().collect();
3246         let vec_len = path_vec.len();
3247 
3248         // Do lookup before util (vec_len-1)-th pathname, this operation is to simulate
3249         // the behavior of VFS, since when VFS call atomic_open only at last look up.
3250         for name in &path_vec[0..vec_len - 1] {
3251             let name = CString::new(name.to_str().unwrap()).unwrap();
3252             let ent = fs.lookup(ctx, inode, &name)?;
3253             inode = ent.inode;
3254         }
3255 
3256         let name = CString::new(path_vec[vec_len - 1].to_str().unwrap()).unwrap();
3257 
3258         fs.atomic_open(ctx, inode, &name, mode, flags, umask, security_ctx)
3259     }
3260 
symlink( fs: &PassthroughFs, linkname: &Path, name: &Path, security_ctx: Option<&CStr>, ) -> io::Result<Entry>3261     fn symlink(
3262         fs: &PassthroughFs,
3263         linkname: &Path,
3264         name: &Path,
3265         security_ctx: Option<&CStr>,
3266     ) -> io::Result<Entry> {
3267         let inode = 1;
3268         let ctx = get_context();
3269         let name = CString::new(name.to_str().unwrap()).unwrap();
3270         let linkname = CString::new(linkname.to_str().unwrap()).unwrap();
3271         fs.symlink(ctx, &linkname, inode, &name, security_ctx)
3272     }
3273 
3274     #[test]
rewrite_xattr_names()3275     fn rewrite_xattr_names() {
3276         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3277         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3278         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3279         let _guard = lock.lock().expect("acquire named lock");
3280 
3281         let cfg = Config {
3282             rewrite_security_xattrs: true,
3283             ..Default::default()
3284         };
3285 
3286         let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
3287 
3288         // Selinux shouldn't get overwritten.
3289         // SAFETY: trivially safe
3290         let selinux = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.selinux\0") };
3291         assert_eq!(p.rewrite_xattr_name(selinux).to_bytes(), selinux.to_bytes());
3292 
3293         // user, trusted, and system should not be changed either.
3294         // SAFETY: trivially safe
3295         let user = unsafe { CStr::from_bytes_with_nul_unchecked(b"user.foobar\0") };
3296         assert_eq!(p.rewrite_xattr_name(user).to_bytes(), user.to_bytes());
3297         // SAFETY: trivially safe
3298         let trusted = unsafe { CStr::from_bytes_with_nul_unchecked(b"trusted.foobar\0") };
3299         assert_eq!(p.rewrite_xattr_name(trusted).to_bytes(), trusted.to_bytes());
3300         // SAFETY: trivially safe
3301         let system = unsafe { CStr::from_bytes_with_nul_unchecked(b"system.foobar\0") };
3302         assert_eq!(p.rewrite_xattr_name(system).to_bytes(), system.to_bytes());
3303 
3304         // sehash should be re-written.
3305         // SAFETY: trivially safe
3306         let sehash = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.sehash\0") };
3307         assert_eq!(
3308             p.rewrite_xattr_name(sehash).to_bytes(),
3309             b"user.virtiofs.security.sehash"
3310         );
3311     }
3312 
3313     #[test]
strip_xattr_names()3314     fn strip_xattr_names() {
3315         let only_nuls = b"\0\0\0\0\0";
3316         let mut actual = only_nuls.to_vec();
3317         strip_xattr_prefix(&mut actual);
3318         assert_eq!(&actual[..], &only_nuls[..]);
3319 
3320         let no_nuls = b"security.sehashuser.virtiofs";
3321         let mut actual = no_nuls.to_vec();
3322         strip_xattr_prefix(&mut actual);
3323         assert_eq!(&actual[..], &no_nuls[..]);
3324 
3325         let empty = b"";
3326         let mut actual = empty.to_vec();
3327         strip_xattr_prefix(&mut actual);
3328         assert_eq!(&actual[..], &empty[..]);
3329 
3330         let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
3331         let mut actual = no_strippable_names.to_vec();
3332         strip_xattr_prefix(&mut actual);
3333         assert_eq!(&actual[..], &no_strippable_names[..]);
3334 
3335         let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
3336         let mut actual = only_strippable_names.to_vec();
3337         strip_xattr_prefix(&mut actual);
3338         assert_eq!(&actual[..], b"security.sehash\0security.wat\0");
3339 
3340         let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
3341         let mut actual = mixed_names.to_vec();
3342         strip_xattr_prefix(&mut actual);
3343         let expected = b"security.sehash\0security.selinux\0security.wat\0user.foobar\0";
3344         assert_eq!(&actual[..], &expected[..]);
3345 
3346         let no_nul_with_prefix = b"user.virtiofs.security.sehash";
3347         let mut actual = no_nul_with_prefix.to_vec();
3348         strip_xattr_prefix(&mut actual);
3349         assert_eq!(&actual[..], b"security.sehash");
3350     }
3351 
3352     #[test]
lookup_files()3353     fn lookup_files() {
3354         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3355         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3356         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3357         let _guard = lock.lock().expect("acquire named lock");
3358 
3359         let temp_dir = TempDir::new().unwrap();
3360         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
3361 
3362         let cfg = Default::default();
3363         let fs = PassthroughFs::new("tag", cfg).unwrap();
3364 
3365         let capable = FsOptions::empty();
3366         fs.init(capable).unwrap();
3367 
3368         assert!(lookup(&fs, &temp_dir.path().join("a.txt")).is_ok());
3369         assert!(lookup(&fs, &temp_dir.path().join("dir")).is_ok());
3370         assert!(lookup(&fs, &temp_dir.path().join("dir/b.txt")).is_ok());
3371 
3372         assert_eq!(
3373             lookup(&fs, &temp_dir.path().join("nonexistent-file"))
3374                 .expect_err("file must not exist")
3375                 .kind(),
3376             io::ErrorKind::NotFound
3377         );
3378         // "A.txt" is different from "a.txt".
3379         assert_eq!(
3380             lookup(&fs, &temp_dir.path().join("A.txt"))
3381                 .expect_err("file must not exist")
3382                 .kind(),
3383             io::ErrorKind::NotFound
3384         );
3385     }
3386 
3387     #[test]
lookup_files_ascii_casefold()3388     fn lookup_files_ascii_casefold() {
3389         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3390         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3391         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3392         let _guard = lock.lock().expect("acquire named lock");
3393 
3394         let temp_dir = TempDir::new().unwrap();
3395         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
3396 
3397         let cfg = Config {
3398             ascii_casefold: true,
3399             ..Default::default()
3400         };
3401         let fs = PassthroughFs::new("tag", cfg).unwrap();
3402 
3403         let capable = FsOptions::empty();
3404         fs.init(capable).unwrap();
3405 
3406         // Ensure that "A.txt" is equated with "a.txt".
3407         let a_inode = lookup(&fs, &temp_dir.path().join("a.txt")).expect("a.txt must be found");
3408         assert_eq!(
3409             lookup(&fs, &temp_dir.path().join("A.txt")).expect("A.txt must exist"),
3410             a_inode
3411         );
3412 
3413         let dir_inode = lookup(&fs, &temp_dir.path().join("dir")).expect("dir must be found");
3414         assert_eq!(
3415             lookup(&fs, &temp_dir.path().join("DiR")).expect("DiR must exist"),
3416             dir_inode
3417         );
3418 
3419         let b_inode =
3420             lookup(&fs, &temp_dir.path().join("dir/b.txt")).expect("dir/b.txt must be found");
3421         assert_eq!(
3422             lookup(&fs, &temp_dir.path().join("dIr/B.TxT")).expect("dIr/B.TxT must exist"),
3423             b_inode
3424         );
3425 
3426         assert_eq!(
3427             lookup(&fs, &temp_dir.path().join("nonexistent-file"))
3428                 .expect_err("file must not exist")
3429                 .kind(),
3430             io::ErrorKind::NotFound
3431         );
3432     }
3433 
test_create_and_remove(ascii_casefold: bool)3434     fn test_create_and_remove(ascii_casefold: bool) {
3435         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3436         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3437         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3438         let _guard = lock.lock().expect("acquire named lock");
3439 
3440         let temp_dir = TempDir::new().unwrap();
3441         let timeout = Duration::from_millis(10);
3442         let cfg = Config {
3443             timeout,
3444             cache_policy: CachePolicy::Auto,
3445             ascii_casefold,
3446             ..Default::default()
3447         };
3448         let fs = PassthroughFs::new("tag", cfg).unwrap();
3449 
3450         let capable = FsOptions::empty();
3451         fs.init(capable).unwrap();
3452 
3453         // Create a.txt and b.txt.
3454         let a_path = temp_dir.path().join("a.txt");
3455         let b_path = temp_dir.path().join("b.txt");
3456         let a_entry = create(&fs, &a_path).expect("create a.txt");
3457         let b_entry = create(&fs, &b_path).expect("create b.txt");
3458         assert_eq!(
3459             a_entry.inode,
3460             lookup(&fs, &a_path).expect("lookup a.txt"),
3461             "Created file 'a.txt' must be looked up"
3462         );
3463         assert_eq!(
3464             b_entry.inode,
3465             lookup(&fs, &b_path).expect("lookup b.txt"),
3466             "Created file 'b.txt' must be looked up"
3467         );
3468 
3469         // Remove a.txt only
3470         unlink(&fs, &a_path).expect("Remove");
3471         assert_eq!(
3472             lookup(&fs, &a_path)
3473                 .expect_err("file must not exist")
3474                 .kind(),
3475             io::ErrorKind::NotFound,
3476             "a.txt must be removed"
3477         );
3478         // "A.TXT" must not be found regardless of whether casefold is enabled or not.
3479         let upper_a_path = temp_dir.path().join("A.TXT");
3480         assert_eq!(
3481             lookup(&fs, &upper_a_path)
3482                 .expect_err("file must not exist")
3483                 .kind(),
3484             io::ErrorKind::NotFound,
3485             "A.txt must be removed"
3486         );
3487 
3488         // Check if the host file system doesn't have a.txt but does b.txt.
3489         assert!(!a_path.exists(), "a.txt must be removed");
3490         assert!(b_path.exists(), "b.txt must exist");
3491     }
3492 
3493     #[test]
create_and_remove()3494     fn create_and_remove() {
3495         test_create_and_remove(false /* casefold */);
3496     }
3497 
3498     #[test]
create_and_remove_casefold()3499     fn create_and_remove_casefold() {
3500         test_create_and_remove(true /* casefold */);
3501     }
3502 
test_create_and_forget(ascii_casefold: bool)3503     fn test_create_and_forget(ascii_casefold: bool) {
3504         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3505         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3506         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3507         let _guard = lock.lock().expect("acquire named lock");
3508 
3509         let temp_dir = TempDir::new().unwrap();
3510         let timeout = Duration::from_millis(10);
3511         let cfg = Config {
3512             timeout,
3513             cache_policy: CachePolicy::Auto,
3514             ascii_casefold,
3515             ..Default::default()
3516         };
3517         let fs = PassthroughFs::new("tag", cfg).unwrap();
3518 
3519         let capable = FsOptions::empty();
3520         fs.init(capable).unwrap();
3521 
3522         // Create a.txt.
3523         let a_path = temp_dir.path().join("a.txt");
3524         let a_entry = create(&fs, &a_path).expect("create a.txt");
3525         assert_eq!(
3526             a_entry.inode,
3527             lookup(&fs, &a_path).expect("lookup a.txt"),
3528             "Created file 'a.txt' must be looked up"
3529         );
3530 
3531         // Forget a.txt's inode from PassthroughFs's internal cache.
3532         forget(&fs, &a_path).expect("forget a.txt");
3533 
3534         if ascii_casefold {
3535             let upper_a_path = temp_dir.path().join("A.TXT");
3536             let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
3537             assert_ne!(
3538                 a_entry.inode, new_a_inode,
3539                 "inode must be changed after forget()"
3540             );
3541             assert_eq!(
3542                 new_a_inode,
3543                 lookup(&fs, &a_path).expect("lookup a.txt"),
3544                 "inode must be same for a.txt and A.TXT"
3545             );
3546         } else {
3547             assert_ne!(
3548                 a_entry.inode,
3549                 lookup(&fs, &a_path).expect("lookup a.txt"),
3550                 "inode must be changed after forget()"
3551             );
3552         }
3553     }
3554 
3555     #[test]
create_and_forget()3556     fn create_and_forget() {
3557         test_create_and_forget(false /* ascii_casefold */);
3558     }
3559 
3560     #[test]
create_and_forget_casefold()3561     fn create_and_forget_casefold() {
3562         test_create_and_forget(true /* ascii_casefold */);
3563     }
3564 
3565     #[test]
casefold_lookup_cache()3566     fn casefold_lookup_cache() {
3567         let temp_dir = TempDir::new().unwrap();
3568         // Prepare `a.txt` before starting the test.
3569         create_test_data(&temp_dir, &[], &["a.txt"]);
3570 
3571         let cfg = Config {
3572             ascii_casefold: true,
3573             ..Default::default()
3574         };
3575         let fs = PassthroughFs::new("tag", cfg).unwrap();
3576 
3577         let capable = FsOptions::empty();
3578         fs.init(capable).unwrap();
3579 
3580         let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");
3581 
3582         // Since `a.txt` exists, "A.TXT" must exist.
3583         let large_a_path = temp_dir.path().join("A.TXT");
3584         // Looking up "A.TXT" must create a CasefoldCache entry.
3585         lookup(&fs, &large_a_path).expect("A.TXT must exist");
3586         assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));
3587 
3588         // Create b.txt.
3589         let b_path = temp_dir.path().join("b.txt");
3590         create(&fs, &b_path).expect("create b.txt");
3591         // Then, b.txt must exists in the cache.
3592         assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
3593         // When removing b.txt, it must be removed from the cache as well.
3594         unlink(&fs, &b_path).expect("remove b.txt");
3595         assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
3596     }
3597 
3598     #[test]
lookup_negative_cache()3599     fn lookup_negative_cache() {
3600         let temp_dir = TempDir::new().unwrap();
3601         // Prepare `a.txt` before starting the test.
3602         create_test_data(&temp_dir, &[], &[]);
3603 
3604         let cfg = Config {
3605             negative_timeout: Duration::from_secs(5),
3606             ..Default::default()
3607         };
3608         let fs = PassthroughFs::new("tag", cfg).unwrap();
3609 
3610         let capable = FsOptions::empty();
3611         fs.init(capable).unwrap();
3612 
3613         let a_path = temp_dir.path().join("a.txt");
3614         // a.txt hasn't existed yet.
3615         // Since negative_timeout is enabled, success with inode=0 is expected.
3616         assert_eq!(
3617             0,
3618             lookup(&fs, &a_path).expect("lookup a.txt"),
3619             "Entry with inode=0 is expected for non-existing file 'a.txt'"
3620         );
3621         // Create a.txt
3622         let a_entry = create(&fs, &a_path).expect("create a.txt");
3623         assert_eq!(
3624             a_entry.inode,
3625             lookup(&fs, &a_path).expect("lookup a.txt"),
3626             "Created file 'a.txt' must be looked up"
3627         );
3628         // Remove a.txt
3629         unlink(&fs, &a_path).expect("Remove");
3630         assert_eq!(
3631             0,
3632             lookup(&fs, &a_path).expect("lookup a.txt"),
3633             "Entry with inode=0 is expected for the removed file 'a.txt'"
3634         );
3635     }
3636     #[test]
test_atomic_open_existing_file()3637     fn test_atomic_open_existing_file() {
3638         atomic_open_existing_file(false);
3639     }
3640 
3641     #[test]
test_atomic_open_existing_file_zero_message()3642     fn test_atomic_open_existing_file_zero_message() {
3643         atomic_open_existing_file(true);
3644     }
3645 
atomic_open_existing_file(zero_message_open: bool)3646     fn atomic_open_existing_file(zero_message_open: bool) {
3647         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3648         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3649         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3650         let _guard = lock.lock().expect("acquire named lock");
3651 
3652         let temp_dir = TempDir::new().unwrap();
3653         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
3654 
3655         let cache_policy = match zero_message_open {
3656             true => CachePolicy::Always,
3657             false => CachePolicy::Auto,
3658         };
3659 
3660         let cfg = Config {
3661             cache_policy,
3662             ..Default::default()
3663         };
3664         let fs = PassthroughFs::new("tag", cfg).unwrap();
3665 
3666         let capable = FsOptions::ZERO_MESSAGE_OPEN;
3667         fs.init(capable).unwrap();
3668 
3669         // atomic_open with flag O_RDWR, should return positive dentry and file handler
3670         let res = atomic_open(
3671             &fs,
3672             &temp_dir.path().join("a.txt"),
3673             0o666,
3674             libc::O_RDWR as u32,
3675             0,
3676             None,
3677         );
3678         assert!(res.is_ok());
3679         let (entry, handler, open_options) = res.unwrap();
3680         assert_ne!(entry.inode, 0);
3681 
3682         if zero_message_open {
3683             assert!(handler.is_none());
3684             assert_eq!(open_options, OpenOptions::KEEP_CACHE);
3685         } else {
3686             assert!(handler.is_some());
3687             assert_ne!(
3688                 open_options & OpenOptions::FILE_CREATED,
3689                 OpenOptions::FILE_CREATED
3690             );
3691         }
3692 
3693         // atomic_open with flag O_RDWR |  O_CREATE, should return positive dentry and file handler
3694         let res = atomic_open(
3695             &fs,
3696             &temp_dir.path().join("dir/b.txt"),
3697             0o666,
3698             (libc::O_RDWR | libc::O_CREAT) as u32,
3699             0,
3700             None,
3701         );
3702         assert!(res.is_ok());
3703         let (entry, handler, open_options) = res.unwrap();
3704         assert_ne!(entry.inode, 0);
3705 
3706         if zero_message_open {
3707             assert!(handler.is_none());
3708             assert_eq!(open_options, OpenOptions::KEEP_CACHE);
3709         } else {
3710             assert!(handler.is_some());
3711             assert_ne!(
3712                 open_options & OpenOptions::FILE_CREATED,
3713                 OpenOptions::FILE_CREATED
3714             );
3715         }
3716 
3717         // atomic_open with flag O_RDWR | O_CREATE | O_EXCL, should return positive dentry and file
3718         // handler
3719         let res = atomic_open(
3720             &fs,
3721             &temp_dir.path().join("dir/c.txt"),
3722             0o666,
3723             (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32,
3724             0,
3725             None,
3726         );
3727         assert!(res.is_err());
3728         let err_kind = res.unwrap_err().kind();
3729         assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
3730     }
3731 
3732     #[test]
test_atomic_open_non_existing_file()3733     fn test_atomic_open_non_existing_file() {
3734         atomic_open_non_existing_file(false);
3735     }
3736 
3737     #[test]
test_atomic_open_non_existing_file_zero_message()3738     fn test_atomic_open_non_existing_file_zero_message() {
3739         atomic_open_non_existing_file(true);
3740     }
3741 
atomic_open_non_existing_file(zero_message_open: bool)3742     fn atomic_open_non_existing_file(zero_message_open: bool) {
3743         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3744         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3745         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3746         let _guard = lock.lock().expect("acquire named lock");
3747 
3748         let temp_dir = TempDir::new().unwrap();
3749 
3750         let cache_policy = match zero_message_open {
3751             true => CachePolicy::Always,
3752             false => CachePolicy::Auto,
3753         };
3754 
3755         let cfg = Config {
3756             cache_policy,
3757             ..Default::default()
3758         };
3759         let fs = PassthroughFs::new("tag", cfg).unwrap();
3760 
3761         let capable = FsOptions::ZERO_MESSAGE_OPEN;
3762         fs.init(capable).unwrap();
3763 
3764         // atomic_open with flag O_RDWR, should return NO_EXIST error
3765         let res = atomic_open(
3766             &fs,
3767             &temp_dir.path().join("a.txt"),
3768             0o666,
3769             libc::O_RDWR as u32,
3770             0,
3771             None,
3772         );
3773         assert!(res.is_err());
3774         let err_kind = res.unwrap_err().kind();
3775         assert_eq!(err_kind, io::ErrorKind::NotFound);
3776 
3777         // atomic_open with flag O_RDWR | O_CREATE, should return positive dentry and file handler
3778         let res = atomic_open(
3779             &fs,
3780             &temp_dir.path().join("b.txt"),
3781             0o666,
3782             (libc::O_RDWR | libc::O_CREAT) as u32,
3783             0,
3784             None,
3785         );
3786         assert!(res.is_ok());
3787         let (entry, handler, open_options) = res.unwrap();
3788         assert_ne!(entry.inode, 0);
3789 
3790         if zero_message_open {
3791             assert!(handler.is_none());
3792             assert_eq!(
3793                 open_options & OpenOptions::KEEP_CACHE,
3794                 OpenOptions::KEEP_CACHE
3795             );
3796         } else {
3797             assert!(handler.is_some());
3798         }
3799         assert_eq!(
3800             open_options & OpenOptions::FILE_CREATED,
3801             OpenOptions::FILE_CREATED
3802         );
3803     }
3804 
3805     #[test]
atomic_open_symbol_link()3806     fn atomic_open_symbol_link() {
3807         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3808         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3809         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3810         let _guard = lock.lock().expect("acquire named lock");
3811 
3812         let temp_dir = TempDir::new().unwrap();
3813         create_test_data(&temp_dir, &["dir"], &["a.txt"]);
3814 
3815         let cfg = Default::default();
3816         let fs = PassthroughFs::new("tag", cfg).unwrap();
3817 
3818         let capable = FsOptions::empty();
3819         fs.init(capable).unwrap();
3820 
3821         // atomic open the link destination file
3822         let res_dst = atomic_open(
3823             &fs,
3824             &temp_dir.path().join("a.txt"),
3825             0o666,
3826             libc::O_RDWR as u32,
3827             0,
3828             None,
3829         );
3830         assert!(res_dst.is_ok());
3831         let (entry_dst, handler_dst, _) = res_dst.unwrap();
3832         assert_ne!(entry_dst.inode, 0);
3833         assert!(handler_dst.is_some());
3834 
3835         // create depth 1 symbol link
3836         let sym1_res = symlink(
3837             &fs,
3838             &temp_dir.path().join("a.txt"),
3839             &temp_dir.path().join("blink"),
3840             None,
3841         );
3842         assert!(sym1_res.is_ok());
3843         let sym1_entry = sym1_res.unwrap();
3844         assert_ne!(sym1_entry.inode, 0);
3845 
3846         // atomic_open symbol link, should return dentry with no handler
3847         let res = atomic_open(
3848             &fs,
3849             &temp_dir.path().join("blink"),
3850             0o666,
3851             libc::O_RDWR as u32,
3852             0,
3853             None,
3854         );
3855         assert!(res.is_ok());
3856         let (entry, handler, open_options) = res.unwrap();
3857         assert_eq!(entry.inode, sym1_entry.inode);
3858         assert!(handler.is_none());
3859         assert_eq!(open_options, OpenOptions::empty());
3860 
3861         // delete link destination
3862         unlink(&fs, &temp_dir.path().join("a.txt")).expect("Remove");
3863         assert_eq!(
3864             lookup(&fs, &temp_dir.path().join("a.txt"))
3865                 .expect_err("file must not exist")
3866                 .kind(),
3867             io::ErrorKind::NotFound,
3868             "a.txt must be removed"
3869         );
3870 
3871         // after link destination removed, should still return valid dentry
3872         let res = atomic_open(
3873             &fs,
3874             &temp_dir.path().join("blink"),
3875             0o666,
3876             libc::O_RDWR as u32,
3877             0,
3878             None,
3879         );
3880         assert!(res.is_ok());
3881         let (entry, handler, open_options) = res.unwrap();
3882         assert_eq!(entry.inode, sym1_entry.inode);
3883         assert!(handler.is_none());
3884         assert_eq!(open_options, OpenOptions::empty());
3885     }
3886 }
3887