• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::borrow::Cow;
6 use std::cmp;
7 use std::collections::btree_map;
8 use std::collections::BTreeMap;
9 use std::ffi::CStr;
10 use std::ffi::CString;
11 use std::fs::File;
12 use std::io;
13 use std::mem;
14 use std::mem::size_of;
15 use std::mem::MaybeUninit;
16 use std::os::raw::c_int;
17 use std::os::raw::c_long;
18 use std::ptr::addr_of;
19 use std::ptr::addr_of_mut;
20 use std::str::FromStr;
21 use std::sync::atomic::AtomicBool;
22 use std::sync::atomic::AtomicU64;
23 use std::sync::atomic::Ordering;
24 use std::sync::Arc;
25 use std::time::Duration;
26 
27 use base::error;
28 use base::ioctl_ior_nr;
29 use base::ioctl_iow_nr;
30 use base::ioctl_iowr_nr;
31 use base::ioctl_with_mut_ptr;
32 use base::ioctl_with_ptr;
33 use base::syscall;
34 use base::AsRawDescriptor;
35 use base::FileFlags;
36 use base::FromRawDescriptor;
37 use base::RawDescriptor;
38 use data_model::zerocopy_from_reader;
39 use data_model::DataInit;
40 use fuse::filesystem::Context;
41 use fuse::filesystem::DirectoryIterator;
42 use fuse::filesystem::Entry;
43 use fuse::filesystem::FileSystem;
44 use fuse::filesystem::FsOptions;
45 use fuse::filesystem::GetxattrReply;
46 use fuse::filesystem::IoctlFlags;
47 use fuse::filesystem::IoctlReply;
48 use fuse::filesystem::ListxattrReply;
49 use fuse::filesystem::OpenOptions;
50 use fuse::filesystem::RemoveMappingOne;
51 use fuse::filesystem::SetattrValid;
52 use fuse::filesystem::ZeroCopyReader;
53 use fuse::filesystem::ZeroCopyWriter;
54 use fuse::filesystem::ROOT_ID;
55 use fuse::sys::WRITE_KILL_PRIV;
56 use fuse::Mapper;
57 #[cfg(feature = "arc_quota")]
58 use protobuf::Message;
59 use serde::Deserialize;
60 use serde::Serialize;
61 use sync::Mutex;
62 #[cfg(feature = "arc_quota")]
63 use system_api::client::OrgChromiumArcQuota;
64 #[cfg(feature = "arc_quota")]
65 use system_api::UserDataAuth::SetMediaRWDataFileProjectIdReply;
66 #[cfg(feature = "arc_quota")]
67 use system_api::UserDataAuth::SetMediaRWDataFileProjectIdRequest;
68 #[cfg(feature = "arc_quota")]
69 use system_api::UserDataAuth::SetMediaRWDataFileProjectInheritanceFlagReply;
70 #[cfg(feature = "arc_quota")]
71 use system_api::UserDataAuth::SetMediaRWDataFileProjectInheritanceFlagRequest;
72 use zerocopy::AsBytes;
73 use zerocopy::FromBytes;
74 
75 use crate::virtio::fs::caps::Capability;
76 use crate::virtio::fs::caps::Caps;
77 use crate::virtio::fs::caps::Set as CapSet;
78 use crate::virtio::fs::caps::Value as CapValue;
79 use crate::virtio::fs::multikey::MultikeyBTreeMap;
80 use crate::virtio::fs::read_dir::ReadDir;
81 
// NUL-terminated byte strings. `EMPTY_CSTR` and `PROC_CSTR` are converted to `CStr`s with
// `CStr::from_bytes_with_nul_unchecked`, so each must keep exactly one trailing NUL byte.
// NOTE(review): `ROOT_CSTR` is presumably consumed the same way elsewhere in this file -- confirm.
const EMPTY_CSTR: &[u8] = b"\0";
const ROOT_CSTR: &[u8] = b"/\0";
const PROC_CSTR: &[u8] = b"/proc\0";

// Extended attribute name prefixes (no NUL terminator). When `Config::rewrite_security_xattrs`
// is set, names that start with `SECURITY_XATTR`, other than exactly `SELINUX_XATTR`, get
// `USER_VIRTIOFS_XATTR` prepended.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";

// Byte lengths of the key fields in `fscrypt_policy_v1` and `fscrypt_policy_v2` respectively.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// NOTE(review): presumably the inode flag bit that makes new children inherit the project id
// (the name matches the Linux `FS_PROJINHERIT_FL` flag) -- confirm against the UAPI headers.
#[cfg(feature = "arc_quota")]
const FS_PROJINHERIT_FL: c_int = 0x20000000;

// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "arc_quota")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
99 
// Version 1 encryption policy. With `#[repr(C)]` the layout is four one-byte fields followed by
// a `FSCRYPT_KEY_DESCRIPTOR_SIZE`-byte master key descriptor.
// NOTE(review): presumably mirrors `struct fscrypt_policy_v1` from the Linux UAPI headers --
// confirm before changing any field.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromBytes)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
109 
// Version 2 encryption policy. With `#[repr(C)]` the layout is four one-byte fields, four
// reserved bytes, and a `FSCRYPT_KEY_IDENTIFIER_SIZE`-byte master key identifier.
// NOTE(review): presumably mirrors `struct fscrypt_policy_v2` from the Linux UAPI headers --
// confirm before changing any field.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromBytes)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
120 
// Overlay of the two policy layouts. Both `fscrypt_policy_v1` and `fscrypt_policy_v2` start
// with a one-byte `_version` field, which is also exposed here as the first union member.
// NOTE(review): callers presumably read `_version` to decide which variant is valid -- confirm.
#[repr(C)]
#[derive(Copy, Clone, FromBytes)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
128 
// Argument block for `FS_IOC_GET_ENCRYPTION_POLICY_EX`: a 64-bit size field followed by the
// policy union.
#[repr(C)]
#[derive(Copy, Clone, FromBytes)]
struct fscrypt_get_policy_ex_arg {
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}

// NOTE(review): this asserts that every bit pattern is a valid value of the struct; that looks
// consistent with the plain integer/byte-array fields above, but confirm if fields are added.
unsafe impl DataInit for fscrypt_get_policy_ex_arg {}

// The ioctl number is declared with a `[u8; 9]` argument type rather than the full struct.
// NOTE(review): presumably 9 = 8 bytes of `policy_size` + 1 byte of policy version, matching
// the kernel's own definition of this ioctl -- confirm against the UAPI headers.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
139 
// Argument block for the `FS_IOC_FSGETXATTR` / `FS_IOC_FSSETXATTR` ioctls declared below. The
// per-field comments record which direction (get/set) each field is used in.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromBytes)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set) */
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
    fsx_pad: [u8; 8],
}

// Read (`ior`) and write (`iow`) ioctl numbers in the 'X' group that take an `fsxattr`.
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);
153 
// Get/set-flags ioctl numbers. All three pairs use group 'f' with numbers 1 (get) and 2 (set);
// they differ only in the declared argument type, which is part of the encoded ioctl number.

// Argument type is the platform's C `long`.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

// Argument type is fixed at 32 bits.
ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

// Argument type is fixed at 64 bits.
ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
162 
// Argument block for `FS_IOC_ENABLE_VERITY`. The salt and signature are passed as
// (size, 64-bit pointer) pairs rather than inline data.
// NOTE(review): presumably mirrors `struct fsverity_enable_arg` from the Linux UAPI headers,
// with `salt_ptr`/`sig_ptr` holding addresses in the caller's address space -- confirm.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromBytes)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    sig_ptr: u64,
    __reserved2: [u64; 11],
}
176 
// Fixed-size header used with `FS_IOC_MEASURE_VERITY`. The C definition ends with a flexible
// `digest[]` array (see the commented-out field), so this struct only covers the header.
// NOTE(review): presumably `digest_size` bytes of digest follow the header in memory -- confirm.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromBytes)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}

// fs-verity ioctl numbers in the 'f' group: enabling is write-only, measuring is read/write.
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
187 
/// Identifier used as the primary key of `PassthroughFs::inodes`.
pub type Inode = u64;
// Identifier used as the key of `PassthroughFs::handles`.
type Handle = u64;
190 
// Alternate key type for the `PassthroughFs::inodes` multi-key map, made of an inode number and
// a device number.
// NOTE(review): presumably filled from `st_ino` / `st_dev` of the host file so that the same
// host file always maps to the same entry -- confirm against the lookup code.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
196 
197 #[derive(PartialEq, Eq, Debug)]
198 enum FileType {
199     Regular,
200     Directory,
201     Other,
202 }
203 
204 impl From<libc::mode_t> for FileType {
from(mode: libc::mode_t) -> Self205     fn from(mode: libc::mode_t) -> Self {
206         match mode & libc::S_IFMT {
207             libc::S_IFREG => FileType::Regular,
208             libc::S_IFDIR => FileType::Directory,
209             _ => FileType::Other,
210         }
211     }
212 }
213 
214 #[derive(Debug)]
215 struct InodeData {
216     inode: Inode,
217     // (File, open_flags)
218     file: Mutex<(File, libc::c_int)>,
219     refcount: AtomicU64,
220     filetype: FileType,
221 }
222 
223 impl AsRawDescriptor for InodeData {
as_raw_descriptor(&self) -> RawDescriptor224     fn as_raw_descriptor(&self) -> RawDescriptor {
225         self.file.lock().0.as_raw_descriptor()
226     }
227 }
228 
229 #[derive(Debug)]
230 struct HandleData {
231     inode: Inode,
232     file: Mutex<File>,
233 }
234 
235 impl AsRawDescriptor for HandleData {
as_raw_descriptor(&self) -> RawDescriptor236     fn as_raw_descriptor(&self) -> RawDescriptor {
237         self.file.lock().as_raw_descriptor()
238     }
239 }
240 
// Generates a guard type `$name` that switches one credential of the calling thread (of type
// `$ty`) through the raw syscall `$syscall_nr` and switches it back when dropped.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            // Value to restore on drop.
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
            // credentials back to `old` when the returned struct is dropped. Returns `Ok(None)`
            // when `val` already equals `old` and no change (or restore) is needed.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                if val == old {
                    // Already running with the requested value; there is nothing to undo later.
                    return Ok(None);
                }

                // Credential changes must stay per-thread, otherwise they would interfere with
                // operations running on other threads under different uids/gids. POSIX, however,
                // requires all threads of a process to share credentials, and libc enforces that
                // by using signals to make every thread repeat the change.
                //
                // Invoking the syscall directly sidesteps that behavior. The setfsuid and
                // setfsgid system calls would be an alternative, but they cannot report errors,
                // so the direct syscall is preferable.

                // SAFETY: the call doesn't modify any memory and the return value is checked.
                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
                if res != 0 {
                    return Err(io::Error::last_os_error());
                }

                Ok(Some($name { old }))
            }
        }

        impl Drop for $name {
            // Restores the saved credential. Failure can only be logged from here.
            fn drop(&mut self) {
                // SAFETY: the call doesn't modify any memory and the return value is checked.
                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if res >= 0 {
                    return;
                }

                error!(
                    "failed to change credentials back to {}: {}",
                    self.old,
                    io::Error::last_os_error(),
                );
            }
        }
    };
}
// Instantiate the uid guard. On 32-bit arm the `*32` variant of the syscall number is used;
// every other architecture uses the plain one.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);

// Instantiate the gid guard with the same per-architecture selection.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);
303 
// Syscall number used to read the effective uid of the calling thread; 32-bit arm uses the
// `*32` variant.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
#[cfg(target_arch = "arm")]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;

// Syscall number used to read the effective gid of the calling thread; 32-bit arm uses the
// `*32` variant.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
#[cfg(target_arch = "arm")]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
313 
// Per-thread cache of the effective uid/gid, filled by a raw syscall the first time each
// thread reads the value. `set_creds` uses these as the "old" values to restore.
thread_local! {
    // Both these calls are safe because they take no parameters, and only return an integer value.
    // The kernel also guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
320 
set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)>321 fn set_creds(
322     uid: libc::uid_t,
323     gid: libc::gid_t,
324 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
325     let olduid = THREAD_EUID.with(|uid| *uid);
326     let oldgid = THREAD_EGID.with(|gid| *gid);
327 
328     // We have to change the gid before we change the uid because if we change the uid first then we
329     // lose the capability to change the gid.  However changing back can happen in any order.
330     ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
331 }
332 
333 struct ScopedUmask {
334     old: libc::mode_t,
335     mask: libc::mode_t,
336 }
337 
338 impl ScopedUmask {
new(mask: libc::mode_t) -> ScopedUmask339     fn new(mask: libc::mode_t) -> ScopedUmask {
340         ScopedUmask {
341             // Safe because this doesn't modify any memory and always succeeds.
342             old: unsafe { libc::umask(mask) },
343             mask,
344         }
345     }
346 }
347 
348 impl Drop for ScopedUmask {
drop(&mut self)349     fn drop(&mut self) {
350         // Safe because this doesn't modify any memory and always succeeds.
351         let previous = unsafe { libc::umask(self.old) };
352         debug_assert_eq!(
353             previous, self.mask,
354             "umask changed while holding ScopedUmask"
355         );
356     }
357 }
358 
359 struct ScopedFsetid(Caps);
360 impl Drop for ScopedFsetid {
drop(&mut self)361     fn drop(&mut self) {
362         if let Err(e) = raise_cap_fsetid(&mut self.0) {
363             error!(
364                 "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
365                 e
366             )
367         }
368     }
369 }
370 
raise_cap_fsetid(c: &mut Caps) -> io::Result<()>371 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
372     c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
373     c.apply()
374 }
375 
376 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
377 // adds the capability back when it is dropped.
drop_cap_fsetid() -> io::Result<ScopedFsetid>378 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
379     let mut caps = Caps::for_current_thread()?;
380     caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
381     caps.apply()?;
382     Ok(ScopedFsetid(caps))
383 }
384 
ebadf() -> io::Error385 fn ebadf() -> io::Error {
386     io::Error::from_raw_os_error(libc::EBADF)
387 }
388 
stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64>389 fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
390     let mut st = MaybeUninit::<libc::stat64>::zeroed();
391 
392     // Safe because this is a constant value and a valid C string.
393     let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
394 
395     // Safe because the kernel will only write data in `st` and we check the return
396     // value.
397     syscall!(unsafe {
398         libc::fstatat64(
399             f.as_raw_descriptor(),
400             pathname.as_ptr(),
401             st.as_mut_ptr(),
402             libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
403         )
404     })?;
405 
406     // Safe because the kernel guarantees that the struct is now fully initialized.
407     Ok(unsafe { st.assume_init() })
408 }
409 
statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64>410 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
411     let mut st = MaybeUninit::<libc::stat64>::zeroed();
412 
413     // Safe because the kernel will only write data in `st` and we check the return
414     // value.
415     syscall!(unsafe {
416         libc::fstatat64(
417             dir.as_raw_descriptor(),
418             name.as_ptr(),
419             st.as_mut_ptr(),
420             libc::AT_SYMLINK_NOFOLLOW,
421         )
422     })?;
423 
424     // Safe because the kernel guarantees that the struct is now fully initialized.
425     Ok(unsafe { st.assume_init() })
426 }
427 
428 /// The caching policy that the file system should report to the FUSE client. By default the FUSE
429 /// protocol uses close-to-open consistency. This means that any cached contents of the file are
430 /// invalidated the next time that file is opened.
431 #[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
432 pub enum CachePolicy {
433     /// The client should never cache file data and all I/O should be directly forwarded to the
434     /// server. This policy must be selected when file contents may change without the knowledge of
435     /// the FUSE client (i.e., the file system does not have exclusive access to the directory).
436     Never,
437 
438     /// The client is free to choose when and how to cache file data. This is the default policy and
439     /// uses close-to-open consistency as described in the enum documentation.
440     #[default]
441     Auto,
442 
443     /// The client should always cache file data. This means that the FUSE client will not
444     /// invalidate any cached data that was returned by the file system the last time the file was
445     /// opened. This policy should only be selected when the file system has exclusive access to the
446     /// directory.
447     Always,
448 }
449 
450 impl FromStr for CachePolicy {
451     type Err = &'static str;
452 
from_str(s: &str) -> Result<Self, Self::Err>453     fn from_str(s: &str) -> Result<Self, Self::Err> {
454         match s {
455             "never" | "Never" | "NEVER" => Ok(CachePolicy::Never),
456             "auto" | "Auto" | "AUTO" => Ok(CachePolicy::Auto),
457             "always" | "Always" | "ALWAYS" => Ok(CachePolicy::Always),
458             _ => Err("invalid cache policy"),
459         }
460     }
461 }
462 
/// Options that configure the behavior of the file system.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Config {
    /// How long the FUSE client should consider directory entries to be valid. If the contents of a
    /// directory can only be modified by the FUSE client (i.e., the file system has exclusive
    /// access), then this should be a large value.
    ///
    /// The default value for this option is 5 seconds.
    pub entry_timeout: Duration,

    /// How long the FUSE client should consider file and directory attributes to be valid. If the
    /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file
    /// system has exclusive access), then this should be set to a large value.
    ///
    /// The default value for this option is 5 seconds.
    pub attr_timeout: Duration,

    /// The caching policy the file system should use. See the documentation of `CachePolicy` for
    /// more details.
    pub cache_policy: CachePolicy,

    /// Whether the file system should enable writeback caching. This can improve performance as it
    /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file
    /// system. However, enabling this option can increase the risk of data corruption if the file
    /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT**
    /// have exclusive access). Additionally, the file system should have read access to all files
    /// in the directory it is serving as the FUSE client may send read requests even for files
    /// opened with `O_WRONLY`.
    ///
    /// Therefore callers should only enable this option when they can guarantee that: 1) the file
    /// system has exclusive access to the directory and 2) the file system has read permissions for
    /// all files in that directory.
    ///
    /// The default value for this option is `false`.
    pub writeback: bool,

    /// Controls whether security.* xattrs (except for security.selinux) are re-written. When this
    /// is set to true, the server will add a "user.virtiofs" prefix to xattrs in the security
    /// namespace. Setting these xattrs requires CAP_SYS_ADMIN in the namespace where the file
    /// system was mounted and since the server usually runs in an unprivileged user namespace, it's
    /// unlikely to have that capability.
    ///
    /// The default value for this option is `false`.
    pub rewrite_security_xattrs: bool,

    /// Use case-insensitive lookups for directory entries (ASCII only).
    ///
    /// The default value for this option is `false`.
    pub ascii_casefold: bool,

    // UIDs which are privileged to perform quota-related operations. We cannot perform a CAP_FOWNER
    // check so we consult this list when the VM tries to set the project quota and the process uid
    // doesn't match the owner uid. In that case, all uids in this list are treated as if they have
    // CAP_FOWNER.
    //
    // The default value for this option is an empty list.
    #[cfg(feature = "arc_quota")]
    pub privileged_quota_uids: Vec<libc::uid_t>,

    /// Use DAX for shared files.
    ///
    /// Enabling DAX can improve performance for frequently accessed files by mapping regions of the
    /// file directly into the VM's memory region, allowing direct access with the cost of slightly
    /// increased latency the first time the file is accessed. Additionally, since the mapping is
    /// shared directly from the host kernel's file cache, enabling DAX can improve performance even
    /// when the cache policy is `Never`.
    ///
    /// The default value for this option is `false`.
    pub use_dax: bool,

    /// Enable support for POSIX acls.
    ///
    /// Enable POSIX acl support for the shared directory. This requires that the underlying file
    /// system also supports POSIX acls.
    ///
    /// The default value for this option is `true`.
    pub posix_acl: bool,
}
539 
540 impl Default for Config {
default() -> Self541     fn default() -> Self {
542         Config {
543             entry_timeout: Duration::from_secs(5),
544             attr_timeout: Duration::from_secs(5),
545             cache_policy: Default::default(),
546             writeback: false,
547             rewrite_security_xattrs: false,
548             ascii_casefold: false,
549             #[cfg(feature = "arc_quota")]
550             privileged_quota_uids: Default::default(),
551             use_dax: false,
552             posix_acl: true,
553         }
554     }
555 }
556 
557 impl FromStr for Config {
558     type Err = &'static str;
559 
from_str(params: &str) -> Result<Self, Self::Err>560     fn from_str(params: &str) -> Result<Self, Self::Err> {
561         let mut cfg = Self::default();
562         if params.is_empty() {
563             return Ok(cfg);
564         }
565         for opt in params.split(':') {
566             let mut o = opt.splitn(2, '=');
567             let kind = o.next().ok_or("`cfg` options mut not be empty")?;
568             let value = o
569                 .next()
570                 .ok_or("`cfg` options must be of the form `kind=value`")?;
571             match kind {
572                 #[cfg(feature = "arc_quota")]
573                 "privileged_quota_uids" => {
574                     cfg.privileged_quota_uids =
575                         value.split(' ').map(|s| s.parse().unwrap()).collect();
576                 }
577                 "timeout" => {
578                     let seconds = value.parse().map_err(|_| "`timeout` must be an integer")?;
579 
580                     let dur = Duration::from_secs(seconds);
581                     cfg.entry_timeout = dur;
582                     cfg.attr_timeout = dur;
583                 }
584                 "cache" => {
585                     let policy = value
586                         .parse()
587                         .map_err(|_| "`cache` must be one of `never`, `always`, or `auto`")?;
588                     cfg.cache_policy = policy;
589                 }
590                 "writeback" => {
591                     let writeback = value.parse().map_err(|_| "`writeback` must be a boolean")?;
592                     cfg.writeback = writeback;
593                 }
594                 "rewrite-security-xattrs" => {
595                     let rewrite_security_xattrs = value
596                         .parse()
597                         .map_err(|_| "`rewrite-security-xattrs` must be a boolean")?;
598                     cfg.rewrite_security_xattrs = rewrite_security_xattrs;
599                 }
600                 "ascii_casefold" => {
601                     let ascii_casefold = value
602                         .parse()
603                         .map_err(|_| "`ascii_casefold` must be a boolean")?;
604                     cfg.ascii_casefold = ascii_casefold;
605                 }
606                 "dax" => {
607                     let use_dax = value.parse().map_err(|_| "`dax` must be a boolean")?;
608                     cfg.use_dax = use_dax;
609                 }
610                 "posix_acl" => {
611                     let posix_acl = value.parse().map_err(|_| "`posix_acl` must be a boolean")?;
612                     cfg.posix_acl = posix_acl;
613                 }
614                 _ => return Err("unrecognized option for virtio-fs config"),
615             }
616         }
617         Ok(cfg)
618     }
619 }
620 
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs {
    // Mutex that must be acquired before executing a process-wide operation such as fchdir.
    process_lock: Mutex<()>,
    // virtio-fs tag that the guest uses when mounting. This is only used for debugging
    // when tracing is enabled.
    #[cfg_attr(not(feature = "trace_marker"), allow(dead_code))]
    tag: String,

    // File descriptors for various points in the file system tree. Entries can be looked up
    // either by `Inode` or by `InodeAltKey`.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    // Source of new `Inode` values; `new` starts it at `ROOT_ID + 1`.
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    // Source of new `Handle` values; `new` starts it at 1.
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus. `None` when
    // `cfg.privileged_quota_uids` is empty.
    #[cfg(feature = "arc_quota")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    // Watch fd of the D-Bus channel, reported by `keep_rds` alongside the `/proc` fd.
    #[cfg(feature = "arc_quota")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    // The configuration this file system was created with.
    cfg: Config,
}
667 
668 impl std::fmt::Debug for PassthroughFs {
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result669     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
670         f.debug_struct("PassthroughFs")
671             .field("tag", &self.tag)
672             .field("next_inode", &self.next_inode)
673             .field("next_handle", &self.next_handle)
674             .field("proc", &self.proc)
675             .field("writeback", &self.writeback)
676             .field("zero_message_open", &self.zero_message_open)
677             .field("zero_message_opendir", &self.zero_message_opendir)
678             .field("cfg", &self.cfg)
679             .finish()
680     }
681 }
682 
683 impl PassthroughFs {
new(tag: &str, cfg: Config) -> io::Result<PassthroughFs>684     pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
685         // Safe because this is a constant value and a valid C string.
686         let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
687 
688         // Safe because this doesn't modify any memory and we check the return value.
689         let raw_descriptor = syscall!(unsafe {
690             libc::openat64(
691                 libc::AT_FDCWD,
692                 proc_cstr.as_ptr(),
693                 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
694             )
695         })?;
696 
697         // Privileged UIDs can use D-Bus to perform some operations.
698         #[cfg(feature = "arc_quota")]
699         let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
700             (None, None)
701         } else {
702             let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
703                 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
704             channel.set_watch_enabled(true);
705             let dbus_fd = channel.watch().fd;
706             channel.set_watch_enabled(false);
707             (
708                 Some(Mutex::new(dbus::blocking::Connection::from(channel))),
709                 Some(dbus_fd),
710             )
711         };
712 
713         // Safe because we just opened this descriptor.
714         let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
715 
716         let passthroughfs = PassthroughFs {
717             process_lock: Mutex::new(()),
718             tag: tag.to_string(),
719             inodes: Mutex::new(MultikeyBTreeMap::new()),
720             next_inode: AtomicU64::new(ROOT_ID + 1),
721 
722             handles: Mutex::new(BTreeMap::new()),
723             next_handle: AtomicU64::new(1),
724 
725             proc,
726 
727             writeback: AtomicBool::new(false),
728             zero_message_open: AtomicBool::new(false),
729             zero_message_opendir: AtomicBool::new(false),
730 
731             #[cfg(feature = "arc_quota")]
732             dbus_connection,
733             #[cfg(feature = "arc_quota")]
734             dbus_fd,
735 
736             cfg,
737         };
738 
739         cros_tracing::trace_simple_print!("New PassthroughFS initialized: {:?}", passthroughfs);
740         Ok(passthroughfs)
741     }
742 
cfg(&self) -> &Config743     pub fn cfg(&self) -> &Config {
744         &self.cfg
745     }
746 
keep_rds(&self) -> Vec<RawDescriptor>747     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
748         #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
749         let mut keep_rds = vec![self.proc.as_raw_descriptor()];
750         #[cfg(feature = "arc_quota")]
751         if let Some(fd) = self.dbus_fd {
752             keep_rds.push(fd);
753         }
754         keep_rds
755     }
756 
rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr>757     fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
758         if !self.cfg.rewrite_security_xattrs {
759             return Cow::Borrowed(name);
760         }
761 
762         // Does not include nul-terminator.
763         let buf = name.to_bytes();
764         if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
765             return Cow::Borrowed(name);
766         }
767 
768         let mut newname = USER_VIRTIOFS_XATTR.to_vec();
769         newname.extend_from_slice(buf);
770 
771         // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
772         // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
773         Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
774     }
775 
find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>>776     fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
777         self.inodes
778             .lock()
779             .get(&inode)
780             .map(Arc::clone)
781             .ok_or_else(ebadf)
782     }
783 
find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>>784     fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
785         self.handles
786             .lock()
787             .get(&handle)
788             .filter(|hd| hd.inode == inode)
789             .map(Arc::clone)
790             .ok_or_else(ebadf)
791     }
792 
open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File>793     fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
794         let pathname = CString::new(format!("self/fd/{}", fd))
795             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
796 
797         // Safe because this doesn't modify any memory and we check the return value. We don't
798         // really check `flags` because if the kernel can't handle poorly specified flags then we
799         // have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
800         // to follow the `/proc/self/fd` symlink to get the file.
801         let raw_descriptor = syscall!(unsafe {
802             libc::openat64(
803                 self.proc.as_raw_descriptor(),
804                 pathname.as_ptr(),
805                 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
806             )
807         })?;
808 
809         // Safe because we just opened this descriptor.
810         Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
811     }
812 
open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File>813     fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
814         // When writeback caching is enabled, the kernel may send read requests even if the
815         // userspace program opened the file write-only. So we need to ensure that we have opened
816         // the file for reading as well as writing.
817         let writeback = self.writeback.load(Ordering::Relaxed);
818         if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
819             flags &= !libc::O_ACCMODE;
820             flags |= libc::O_RDWR;
821         }
822 
823         // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
824         // However, this breaks atomicity as the file may have changed on disk, invalidating the
825         // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
826         // the file. Just allow this for now as it is the user's responsibility to enable writeback
827         // caching only for directories that are not shared. It also means that we need to clear the
828         // `O_APPEND` flag.
829         if writeback && flags & libc::O_APPEND != 0 {
830             flags &= !libc::O_APPEND;
831         }
832 
833         self.open_fd(inode.as_raw_descriptor(), flags)
834     }
835 
836     // Increases the inode refcount and returns the inode.
increase_inode_refcount(&self, inode_data: &InodeData) -> Inode837     fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
838         // Matches with the release store in `forget`.
839         inode_data.refcount.fetch_add(1, Ordering::Acquire);
840         inode_data.inode
841     }
842 
843     // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
844     // The inodes mutex lock must not be already taken by the same thread otherwise this
845     // will deadlock.
add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry846     fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry {
847         let mut inodes = self.inodes.lock();
848 
849         let altkey = InodeAltKey {
850             ino: st.st_ino,
851             dev: st.st_dev,
852         };
853 
854         let inode = if let Some(data) = inodes.get_alt(&altkey) {
855             self.increase_inode_refcount(data)
856         } else {
857             let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
858             inodes.insert(
859                 inode,
860                 altkey,
861                 Arc::new(InodeData {
862                     inode,
863                     file: Mutex::new((f, open_flags)),
864                     refcount: AtomicU64::new(1),
865                     filetype: st.st_mode.into(),
866                 }),
867             );
868 
869             inode
870         };
871 
872         Entry {
873             inode,
874             generation: 0,
875             attr: st,
876             attr_timeout: self.cfg.attr_timeout,
877             entry_timeout: self.cfg.entry_timeout,
878         }
879     }
880 
881     // Performs an ascii case insensitive lookup.
ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry>882     fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
883         let mut buf = [0u8; 1024];
884         let mut offset = 0;
885         loop {
886             let mut read_dir = ReadDir::new(parent, offset, &mut buf[..])?;
887             if read_dir.remaining() == 0 {
888                 break;
889             }
890 
891             while let Some(entry) = read_dir.next() {
892                 offset = entry.offset as libc::off64_t;
893                 if name.eq_ignore_ascii_case(entry.name.to_bytes()) {
894                     return self.do_lookup(parent, entry.name);
895                 }
896             }
897         }
898         Err(io::Error::from_raw_os_error(libc::ENOENT))
899     }
900 
do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry>901     fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
902         let st = statat(parent, name)?;
903 
904         let altkey = InodeAltKey {
905             ino: st.st_ino,
906             dev: st.st_dev,
907         };
908 
909         // Check if we already have an entry before opening a new file.
910         if let Some(data) = self.inodes.lock().get_alt(&altkey) {
911             // Return the same inode with the reference counter increased.
912             return Ok(Entry {
913                 inode: self.increase_inode_refcount(data),
914                 generation: 0,
915                 attr: st,
916                 attr_timeout: self.cfg.attr_timeout,
917                 entry_timeout: self.cfg.entry_timeout,
918             });
919         }
920 
921         // Open a regular file with O_RDONLY to store in `InodeData` so explicit open requests can
922         // be skipped later if the ZERO_MESSAGE_{OPEN,OPENDIR} features are enabled.
923         // If the crosvm process doesn't have a read permission, fall back to O_PATH below.
924         let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
925         match FileType::from(st.st_mode) {
926             FileType::Regular => {}
927             FileType::Directory => flags |= libc::O_DIRECTORY,
928             FileType::Other => flags |= libc::O_PATH,
929         };
930 
931         // Safe because this doesn't modify any memory and we check the return value.
932         let fd = match unsafe {
933             syscall!(libc::openat64(
934                 parent.as_raw_descriptor(),
935                 name.as_ptr(),
936                 flags
937             ))
938         } {
939             Ok(fd) => fd,
940             Err(e) if e.errno() == libc::EACCES => {
941                 // If O_RDONLY is unavailable, fall back to O_PATH to get an FD to store in
942                 // `InodeData`.
943                 // Note that some operations which should be allowed without read permissions
944                 // require syscalls that don't support O_PATH fds. For those syscalls, we will
945                 // need to fall back to their path-based equivalents with /self/fd/${FD}.
946                 // e.g. `fgetxattr()` for an O_PATH FD fails while `getxaattr()` for /self/fd/${FD}
947                 // works.
948                 flags |= libc::O_PATH;
949                 // Safe because this doesn't modify any memory and we check the return value.
950                 unsafe {
951                     syscall!(libc::openat64(
952                         parent.as_raw_descriptor(),
953                         name.as_ptr(),
954                         flags
955                     ))
956                 }?
957             }
958             Err(e) => {
959                 return Err(e.into());
960             }
961         };
962 
963         // Safe because we own the fd.
964         let f = unsafe { File::from_raw_descriptor(fd) };
965         // We made sure the lock acquired for `self.inodes` is released automatically when
966         // the if block above is exited, so a call to `self.add_entry()` should not cause a deadlock
967         // here. This would not be the case if this were executed in an else block instead.
968         Ok(self.add_entry(f, st, flags))
969     }
970 
do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)>971     fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
972         let inode_data = self.find_inode(inode)?;
973 
974         let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
975 
976         let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
977         let data = HandleData { inode, file };
978 
979         self.handles.lock().insert(handle, Arc::new(data));
980 
981         let mut opts = OpenOptions::empty();
982         match self.cfg.cache_policy {
983             // We only set the direct I/O option on files.
984             CachePolicy::Never => opts.set(
985                 OpenOptions::DIRECT_IO,
986                 flags & (libc::O_DIRECTORY as u32) == 0,
987             ),
988             CachePolicy::Always => {
989                 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
990                     OpenOptions::KEEP_CACHE
991                 } else {
992                     OpenOptions::CACHE_DIR
993                 }
994             }
995             _ => {}
996         };
997 
998         Ok((Some(handle), opts))
999     }
1000 
do_release(&self, inode: Inode, handle: Handle) -> io::Result<()>1001     fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1002         let mut handles = self.handles.lock();
1003 
1004         if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1005             if e.get().inode == inode {
1006                 // We don't need to close the file here because that will happen automatically when
1007                 // the last `Arc` is dropped.
1008                 e.remove();
1009                 return Ok(());
1010             }
1011         }
1012 
1013         Err(ebadf())
1014     }
1015 
do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)>1016     fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1017         let st = stat(inode)?;
1018 
1019         Ok((st, self.cfg.attr_timeout))
1020     }
1021 
do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()>1022     fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1023         // Safe because this doesn't modify any memory and we check the return value.
1024         syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1025         Ok(())
1026     }
1027 
do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()>1028     fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1029         // Safe because this doesn't modify any memory and we check the return value.
1030         syscall!(unsafe {
1031             if datasync {
1032                 libc::fdatasync(file.as_raw_descriptor())
1033             } else {
1034                 libc::fsync(file.as_raw_descriptor())
1035             }
1036         })?;
1037 
1038         Ok(())
1039     }
1040 
1041     // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
1042     // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
1043     // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
1044     // root inode.
1045     //
1046     // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
1047     // be taken to avoid the risk of deadlocks.
with_proc_chdir<F, T>(&self, f: F) -> T where F: FnOnce() -> T,1048     fn with_proc_chdir<F, T>(&self, f: F) -> T
1049     where
1050         F: FnOnce() -> T,
1051     {
1052         let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
1053 
1054         // Acquire a lock for `fchdir`.
1055         let _proc_lock = self.process_lock.lock();
1056         // Safe because this doesn't modify any memory and we check the return value. Since the
1057         // fchdir should never fail we just use debug_asserts.
1058         let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
1059         debug_assert_eq!(
1060             proc_cwd,
1061             0,
1062             "failed to fchdir to /proc: {}",
1063             io::Error::last_os_error()
1064         );
1065 
1066         let res = f();
1067 
1068         // Safe because this doesn't modify any memory and we check the return value. Since the
1069         // fchdir should never fail we just use debug_asserts.
1070         let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
1071         debug_assert_eq!(
1072             root_cwd,
1073             0,
1074             "failed to fchdir back to root directory: {}",
1075             io::Error::last_os_error()
1076         );
1077 
1078         res
1079     }
1080 
do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize>1081     fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
1082         let file = inode.file.lock();
1083         let o_path_file = (file.1 & libc::O_PATH) != 0;
1084         let res = if o_path_file {
1085             // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
1086             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1087             //  and then setting the CWD back to the root directory.
1088             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
1089                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1090 
1091             // Safe because this will only modify `value` and we check the return value.
1092             self.with_proc_chdir(|| unsafe {
1093                 libc::getxattr(
1094                     path.as_ptr(),
1095                     name.as_ptr(),
1096                     value.as_mut_ptr() as *mut libc::c_void,
1097                     value.len() as libc::size_t,
1098                 )
1099             })
1100         } else {
1101             // For regular files and directories, we can just use fgetxattr. Safe because this will
1102             // only write to `value` and we check the return value.
1103             unsafe {
1104                 libc::fgetxattr(
1105                     file.0.as_raw_descriptor(),
1106                     name.as_ptr(),
1107                     value.as_mut_ptr() as *mut libc::c_void,
1108                     value.len() as libc::size_t,
1109                 )
1110             }
1111         };
1112 
1113         if res < 0 {
1114             Err(io::Error::last_os_error())
1115         } else {
1116             Ok(res as usize)
1117         }
1118     }
1119 
get_encryption_policy_ex<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1120     fn get_encryption_policy_ex<R: io::Read>(
1121         &self,
1122         inode: Inode,
1123         handle: Handle,
1124         mut r: R,
1125     ) -> io::Result<IoctlReply> {
1126         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1127             self.find_inode(inode)?
1128         } else {
1129             self.find_handle(handle, inode)?
1130         };
1131 
1132         // Safe because this only has integer fields.
1133         let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
1134         r.read_exact(arg.policy_size.as_bytes_mut())?;
1135 
1136         let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
1137         arg.policy_size = policy_size;
1138 
1139         // Safe because the kernel will only write to `arg` and we check the return value.
1140         let res =
1141             unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
1142         if res < 0 {
1143             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1144         } else {
1145             let len = size_of::<u64>() + arg.policy_size as usize;
1146             Ok(IoctlReply::Done(Ok(arg.as_slice()[..len].to_vec())))
1147         }
1148     }
1149 
get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1150     fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1151         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1152             self.find_inode(inode)?
1153         } else {
1154             self.find_handle(handle, inode)?
1155         };
1156 
1157         let mut buf = MaybeUninit::<fsxattr>::zeroed();
1158 
1159         // Safe because the kernel will only write to `buf` and we check the return value.
1160         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
1161         if res < 0 {
1162             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1163         } else {
1164             // Safe because the kernel guarantees that the policy is now initialized.
1165             let xattr = unsafe { buf.assume_init() };
1166             Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1167         }
1168     }
1169 
set_fsxattr<R: io::Read>( &self, #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context, inode: Inode, handle: Handle, r: R, ) -> io::Result<IoctlReply>1170     fn set_fsxattr<R: io::Read>(
1171         &self,
1172         #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1173         inode: Inode,
1174         handle: Handle,
1175         r: R,
1176     ) -> io::Result<IoctlReply> {
1177         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1178             self.find_inode(inode)?
1179         } else {
1180             self.find_handle(handle, inode)?
1181         };
1182 
1183         let in_attr: fsxattr = zerocopy_from_reader(r)?;
1184 
1185         #[cfg(feature = "arc_quota")]
1186         let st = stat(&*data)?;
1187 
1188         // Changing quota project ID requires CAP_FOWNER or being file owner.
1189         // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1190         #[cfg(feature = "arc_quota")]
1191         if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
1192             // Get the current fsxattr.
1193             let mut buf = MaybeUninit::<fsxattr>::zeroed();
1194             // Safe because the kernel will only write to `buf` and we check the return value.
1195             let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
1196             if res < 0 {
1197                 return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1198             }
1199             // Safe because the kernel guarantees that the policy is now initialized.
1200             let current_attr = unsafe { buf.assume_init() };
1201 
1202             // Project ID cannot be changed inside a user namespace.
1203             // Use UserDataAuth to avoid this restriction.
1204             if current_attr.fsx_projid != in_attr.fsx_projid {
1205                 let connection = self.dbus_connection.as_ref().unwrap().lock();
1206                 let proxy = connection.with_proxy(
1207                     "org.chromium.UserDataAuth",
1208                     "/org/chromium/UserDataAuth",
1209                     DEFAULT_DBUS_TIMEOUT,
1210                 );
1211                 let mut proto: SetMediaRWDataFileProjectIdRequest = Message::new();
1212                 proto.project_id = in_attr.fsx_projid;
1213                 // Safe because data is a valid file descriptor.
1214                 let fd = unsafe { dbus::arg::OwnedFd::new(base::clone_descriptor(&*data)?) };
1215                 match proxy.set_media_rwdata_file_project_id(fd, proto.write_to_bytes().unwrap()) {
1216                     Ok(r) => {
1217                         let r = SetMediaRWDataFileProjectIdReply::parse_from_bytes(&r)
1218                             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1219                         if !r.success {
1220                             return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1221                                 r.error,
1222                             ))));
1223                         }
1224                     }
1225                     Err(e) => {
1226                         return Err(io::Error::new(io::ErrorKind::Other, e));
1227                     }
1228                 };
1229             }
1230         }
1231 
1232         //  Safe because this doesn't modify any memory and we check the return value.
1233         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &in_attr) };
1234         if res < 0 {
1235             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1236         } else {
1237             Ok(IoctlReply::Done(Ok(Vec::new())))
1238         }
1239     }
1240 
get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1241     fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1242         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1243             self.find_inode(inode)?
1244         } else {
1245             self.find_handle(handle, inode)?
1246         };
1247 
1248         // The ioctl encoding is a long but the parameter is actually an int.
1249         let mut flags: c_int = 0;
1250 
1251         // Safe because the kernel will only write to `flags` and we check the return value.
1252         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut flags) };
1253         if res < 0 {
1254             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1255         } else {
1256             Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1257         }
1258     }
1259 
set_flags<R: io::Read>( &self, #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context, inode: Inode, handle: Handle, r: R, ) -> io::Result<IoctlReply>1260     fn set_flags<R: io::Read>(
1261         &self,
1262         #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1263         inode: Inode,
1264         handle: Handle,
1265         r: R,
1266     ) -> io::Result<IoctlReply> {
1267         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1268             self.find_inode(inode)?
1269         } else {
1270             self.find_handle(handle, inode)?
1271         };
1272 
1273         // The ioctl encoding is a long but the parameter is actually an int.
1274         let in_flags: c_int = zerocopy_from_reader(r)?;
1275 
1276         #[cfg(feature = "arc_quota")]
1277         let st = stat(&*data)?;
1278 
1279         // Only privleged uid can perform FS_IOC_SETFLAGS through cryptohome.
1280         #[cfg(feature = "arc_quota")]
1281         if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
1282             // Get the current flag.
1283             let mut buf = MaybeUninit::<c_int>::zeroed();
1284             // Safe because the kernel will only write to `buf` and we check the return value.
1285             let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), buf.as_mut_ptr()) };
1286             if res < 0 {
1287                 return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1288             }
1289             // Safe because the kernel guarantees that the policy is now initialized.
1290             let current_flags = unsafe { buf.assume_init() };
1291 
1292             // Project inheritance flag cannot be changed inside a user namespace.
1293             // Use UserDataAuth to avoid this restriction.
1294             if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
1295                 let connection = self.dbus_connection.as_ref().unwrap().lock();
1296                 let proxy = connection.with_proxy(
1297                     "org.chromium.UserDataAuth",
1298                     "/org/chromium/UserDataAuth",
1299                     DEFAULT_DBUS_TIMEOUT,
1300                 );
1301                 let mut proto: SetMediaRWDataFileProjectInheritanceFlagRequest = Message::new();
1302                 // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
1303                 // reset.
1304                 proto.enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
1305                 // Safe because data is a valid file descriptor.
1306                 let fd = unsafe { dbus::arg::OwnedFd::new(base::clone_descriptor(&*data)?) };
1307                 match proxy.set_media_rwdata_file_project_inheritance_flag(
1308                     fd,
1309                     proto.write_to_bytes().unwrap(),
1310                 ) {
1311                     Ok(r) => {
1312                         let r = SetMediaRWDataFileProjectInheritanceFlagReply::parse_from_bytes(&r)
1313                             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1314                         if !r.success {
1315                             return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1316                                 r.error,
1317                             ))));
1318                         }
1319                     }
1320                     Err(e) => {
1321                         return Err(io::Error::new(io::ErrorKind::Other, e));
1322                     }
1323                 };
1324             }
1325         }
1326 
1327         // Safe because this doesn't modify any memory and we check the return value.
1328         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &in_flags) };
1329         if res < 0 {
1330             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1331         } else {
1332             Ok(IoctlReply::Done(Ok(Vec::new())))
1333         }
1334     }
1335 
enable_verity<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1336     fn enable_verity<R: io::Read>(
1337         &self,
1338         inode: Inode,
1339         handle: Handle,
1340         mut r: R,
1341     ) -> io::Result<IoctlReply> {
1342         let inode_data = self.find_inode(inode)?;
1343 
1344         // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1345         match inode_data.filetype {
1346             FileType::Regular => {}
1347             FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1348             FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1349         }
1350 
1351         {
1352             // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1353             let mut file = inode_data.file.lock();
1354             let mut flags = file.1;
1355             match flags & libc::O_ACCMODE {
1356                 libc::O_WRONLY | libc::O_RDWR => {
1357                     flags &= !libc::O_ACCMODE;
1358                     flags |= libc::O_RDONLY;
1359 
1360                     // We need to get a read-only handle for this file.
1361                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDONLY)?;
1362                     *file = (newfile, flags);
1363                 }
1364                 libc::O_RDONLY => {}
1365                 _ => panic!("Unexpected flags: {:#x}", flags),
1366             }
1367         }
1368 
1369         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1370             inode_data
1371         } else {
1372             let data = self.find_handle(handle, inode)?;
1373 
1374             {
1375                 // We can't enable verity while holding a writable fd. We don't know whether the file
1376                 // was opened for writing so check it here. We don't expect this to be a frequent
1377                 // operation so the extra latency should be fine.
1378                 let mut file = data.file.lock();
1379                 let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1380                 match flags {
1381                     FileFlags::ReadWrite | FileFlags::Write => {
1382                         // We need to get a read-only handle for this file.
1383                         *file = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1384                     }
1385                     FileFlags::Read => {}
1386                 }
1387             }
1388 
1389             data
1390         };
1391 
1392         let mut arg: fsverity_enable_arg = zerocopy_from_reader(&mut r)?;
1393 
1394         let mut salt;
1395         if arg.salt_size > 0 {
1396             if arg.salt_size > self.max_buffer_size() {
1397                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1398                     libc::ENOMEM,
1399                 ))));
1400             }
1401             salt = vec![0; arg.salt_size as usize];
1402             r.read_exact(&mut salt)?;
1403             arg.salt_ptr = salt.as_ptr() as usize as u64;
1404         } else {
1405             arg.salt_ptr = 0;
1406         }
1407 
1408         let mut sig;
1409         if arg.sig_size > 0 {
1410             if arg.sig_size > self.max_buffer_size() {
1411                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1412                     libc::ENOMEM,
1413                 ))));
1414             }
1415             sig = vec![0; arg.sig_size as usize];
1416             r.read_exact(&mut sig)?;
1417             arg.sig_ptr = sig.as_ptr() as usize as u64;
1418         } else {
1419             arg.sig_ptr = 0;
1420         }
1421 
1422         // Safe because this doesn't modify any memory and we check the return value.
1423         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY(), &arg) };
1424         if res < 0 {
1425             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1426         } else {
1427             Ok(IoctlReply::Done(Ok(Vec::new())))
1428         }
1429     }
1430 
    // Services a measure-verity ioctl by issuing `FS_IOC_MEASURE_VERITY` on the descriptor that
    // backs `inode` (when zero-message open is enabled) or `handle` (otherwise).
    //
    // An `fsverity_digest` header is read from `r`; its `digest_size` is compared against the
    // size reported by the kernel, and `out_size` is compared against the total reply length.
    // If either is too small the reply carries `EOVERFLOW`. A failing ioctl is also reported
    // inside `IoctlReply::Done` rather than as an `Err` from this function. On success the reply
    // carries the first `outlen` bytes of the kernel-filled buffer, i.e. the header followed by
    // `digest_size` bytes.
    fn measure_verity<R: io::Read>(
        &self,
        inode: Inode,
        handle: Handle,
        r: R,
        out_size: u32,
    ) -> io::Result<IoctlReply> {
        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
            self.find_inode(inode)?
        } else {
            self.find_handle(handle, inode)?
        };

        // The header supplied by the caller; only its `digest_size` is consulted below.
        let digest: fsverity_digest = zerocopy_from_reader(r)?;

        // Taken from fs/verity/fsverity_private.h.
        const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;

        // This digest size is what the fsverity command line utility uses.
        const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
        const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
        // Number of `fsverity_digest`-sized elements needed to hold `BUFLEN` bytes, rounded up.
        const ROUNDED_LEN: usize =
            (BUFLEN + size_of::<fsverity_digest>() - 1) / size_of::<fsverity_digest>();

        // Make sure we get a properly aligned allocation.
        let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];

        // Safe because we are only writing data and not reading uninitialized memory.
        unsafe {
            // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
            addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
                .write(DIGEST_SIZE)
        };

        // Safe because this will only modify `buf` and we check the return value.
        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY(), buf.as_mut_ptr()) };
        if res < 0 {
            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
        } else {
            // Safe because this value was initialized by us already and then overwritten by the
            // kernel.
            // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
            let digest_size =
                unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
            let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);

            // The kernel guarantees this but it doesn't hurt to be paranoid.
            debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
            // Reject the request if the caller's digest capacity or overall output capacity is
            // smaller than what the kernel produced.
            if digest.digest_size < digest_size || out_size < outlen {
                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                    libc::EOVERFLOW,
                ))));
            }

            // Safe because any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
            // doesn't contain any references.
            let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
                unsafe { mem::transmute(buf) };

            // Casting to `*const [u8]` is safe because the kernel guarantees that the first
            // `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed to have
            // the same layout as `u8`.
            // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
            let buf =
                unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
            Ok(IoctlReply::Done(Ok(buf.to_vec())))
        }
    }
1499 }
1500 
forget_one( inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, inode: Inode, count: u64, )1501 fn forget_one(
1502     inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
1503     inode: Inode,
1504     count: u64,
1505 ) {
1506     if let Some(data) = inodes.get(&inode) {
1507         // Acquiring the write lock on the inode map prevents new lookups from incrementing the
1508         // refcount but there is the possibility that a previous lookup already acquired a
1509         // reference to the inode data and is in the process of updating the refcount so we need
1510         // to loop here until we can decrement successfully.
1511         loop {
1512             let refcount = data.refcount.load(Ordering::Relaxed);
1513 
1514             // Saturating sub because it doesn't make sense for a refcount to go below zero and
1515             // we don't want misbehaving clients to cause integer overflow.
1516             let new_count = refcount.saturating_sub(count);
1517 
1518             // Synchronizes with the acquire load in `do_lookup`.
1519             if data
1520                 .refcount
1521                 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
1522                 .is_ok()
1523             {
1524                 if new_count == 0 {
1525                     // We just removed the last refcount for this inode. There's no need for an
1526                     // acquire fence here because we hold a write lock on the inode map and any
1527                     // thread that is waiting to do a forget on the same inode will have to wait
1528                     // until we release the lock. So there's is no other release store for us to
1529                     // synchronize with before deleting the entry.
1530                     inodes.remove(&inode);
1531                 }
1532                 break;
1533             }
1534         }
1535     }
1536 }
1537 
1538 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
1539 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
strip_xattr_prefix(buf: &mut Vec<u8>)1540 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
1541     fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
1542         if start >= b.len() {
1543             return None;
1544         }
1545 
1546         let end = b[start..]
1547             .iter()
1548             .position(|&c| c == b'\0')
1549             .map(|p| start + p + 1)
1550             .unwrap_or(b.len());
1551 
1552         Some(&b[start..end])
1553     }
1554 
1555     let mut pos = 0;
1556     while let Some(name) = next_cstr(buf, pos) {
1557         if !name.starts_with(USER_VIRTIOFS_XATTR) {
1558             pos += name.len();
1559             continue;
1560         }
1561 
1562         let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
1563         buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
1564         pos += newlen;
1565     }
1566 }
1567 
1568 impl FileSystem for PassthroughFs {
1569     type Inode = Inode;
1570     type Handle = Handle;
1571     type DirIter = ReadDir<Box<[u8]>>;
1572 
init(&self, capable: FsOptions) -> io::Result<FsOptions>1573     fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
1574         // Safe because this is a constant value and a valid C string.
1575         let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
1576 
1577         let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1578         // Safe because this doesn't modify any memory and we check the return value.
1579         let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
1580         if raw_descriptor < 0 {
1581             return Err(io::Error::last_os_error());
1582         }
1583 
1584         // Safe because we just opened this descriptor above.
1585         let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
1586 
1587         let st = stat(&f)?;
1588 
1589         // Safe because this doesn't modify any memory and there is no need to check the return
1590         // value because this system call always succeeds. We need to clear the umask here because
1591         // we want the client to be able to set all the bits in the mode.
1592         unsafe { libc::umask(0o000) };
1593 
1594         let mut inodes = self.inodes.lock();
1595 
1596         // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1597         inodes.insert(
1598             ROOT_ID,
1599             InodeAltKey {
1600                 ino: st.st_ino,
1601                 dev: st.st_dev,
1602             },
1603             Arc::new(InodeData {
1604                 inode: ROOT_ID,
1605                 file: Mutex::new((f, flags)),
1606                 refcount: AtomicU64::new(2),
1607                 filetype: st.st_mode.into(),
1608             }),
1609         );
1610 
1611         let mut opts = FsOptions::DO_READDIRPLUS
1612             | FsOptions::READDIRPLUS_AUTO
1613             | FsOptions::EXPORT_SUPPORT
1614             | FsOptions::DONT_MASK;
1615         if self.cfg.posix_acl {
1616             opts |= FsOptions::POSIX_ACL;
1617         }
1618         if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
1619             opts |= FsOptions::WRITEBACK_CACHE;
1620             self.writeback.store(true, Ordering::Relaxed);
1621         }
1622         if self.cfg.cache_policy == CachePolicy::Always {
1623             if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
1624                 opts |= FsOptions::ZERO_MESSAGE_OPEN;
1625                 self.zero_message_open.store(true, Ordering::Relaxed);
1626             }
1627             if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
1628                 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
1629                 self.zero_message_opendir.store(true, Ordering::Relaxed);
1630             }
1631         }
1632         Ok(opts)
1633     }
1634 
destroy(&self)1635     fn destroy(&self) {
1636         cros_tracing::trace_simple_print!("{:?}: destroy", self);
1637         self.handles.lock().clear();
1638         self.inodes.lock().clear();
1639     }
1640 
statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64>1641     fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
1642         cros_tracing::trace_simple_print!("{}: statfs: inode={inode}", self.tag);
1643         let data = self.find_inode(inode)?;
1644 
1645         let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1646 
1647         // Safe because this will only modify `out` and we check the return value.
1648         syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
1649 
1650         // Safe because the kernel guarantees that `out` has been initialized.
1651         Ok(unsafe { out.assume_init() })
1652     }
1653 
lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry>1654     fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
1655         cros_tracing::trace_simple_print!(
1656             "{}: lookup: inode={}, name={:?}",
1657             self.tag,
1658             parent,
1659             name
1660         );
1661         let data = self.find_inode(parent)?;
1662         self.do_lookup(&data, name).or_else(|e| {
1663             if self.cfg.ascii_casefold {
1664                 self.ascii_casefold_lookup(&data, name.to_bytes())
1665             } else {
1666                 Err(e)
1667             }
1668         })
1669     }
1670 
forget(&self, _ctx: Context, inode: Inode, count: u64)1671     fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
1672         cros_tracing::trace_simple_print!("{}: forget: inode={inode}, count={count}", self.tag);
1673         let mut inodes = self.inodes.lock();
1674 
1675         forget_one(&mut inodes, inode, count)
1676     }
1677 
batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>)1678     fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
1679         let mut inodes = self.inodes.lock();
1680 
1681         for (inode, count) in requests {
1682             forget_one(&mut inodes, inode, count)
1683         }
1684     }
1685 
opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1686     fn opendir(
1687         &self,
1688         _ctx: Context,
1689         inode: Inode,
1690         flags: u32,
1691     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1692         cros_tracing::trace_simple_print!("{}: opendir: inode={inode}, flags={flags}", self.tag);
1693         if self.zero_message_opendir.load(Ordering::Relaxed) {
1694             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1695         } else {
1696             self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
1697         }
1698     }
1699 
releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()>1700     fn releasedir(
1701         &self,
1702         _ctx: Context,
1703         inode: Inode,
1704         _flags: u32,
1705         handle: Handle,
1706     ) -> io::Result<()> {
1707         cros_tracing::trace_simple_print!(
1708             "{}: releasedir: inode={inode}, handle={handle}",
1709             self.tag
1710         );
1711         if self.zero_message_opendir.load(Ordering::Relaxed) {
1712             Ok(())
1713         } else {
1714             self.do_release(inode, handle)
1715         }
1716     }
1717 
mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, ) -> io::Result<Entry>1718     fn mkdir(
1719         &self,
1720         ctx: Context,
1721         parent: Inode,
1722         name: &CStr,
1723         mode: u32,
1724         umask: u32,
1725     ) -> io::Result<Entry> {
1726         cros_tracing::trace_simple_print!(
1727             "{}: mkdir: inode={parent}, name={:?}, mode={mode}, umask={umask}",
1728             self.tag,
1729             name
1730         );
1731         let data = self.find_inode(parent)?;
1732 
1733         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1734         {
1735             let _scoped_umask = ScopedUmask::new(umask);
1736 
1737             // Safe because this doesn't modify any memory and we check the return value.
1738             syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
1739         }
1740 
1741         self.do_lookup(&data, name)
1742     }
1743 
rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1744     fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1745         cros_tracing::trace_simple_print!("{}: rmdir: inode={parent}, name={:?}", self.tag, name);
1746         let data = self.find_inode(parent)?;
1747         self.do_unlink(&data, name, libc::AT_REMOVEDIR)
1748     }
1749 
readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result<Self::DirIter>1750     fn readdir(
1751         &self,
1752         _ctx: Context,
1753         inode: Inode,
1754         handle: Handle,
1755         size: u32,
1756         offset: u64,
1757     ) -> io::Result<Self::DirIter> {
1758         cros_tracing::trace_simple_print!(
1759             "{}: readdir: inode={inode}, handle={handle}, size={size}, offset={offset}",
1760             self.tag
1761         );
1762         let buf = vec![0; size as usize].into_boxed_slice();
1763 
1764         if self.zero_message_opendir.load(Ordering::Relaxed) {
1765             let data = self.find_inode(inode)?;
1766             ReadDir::new(&*data, offset as libc::off64_t, buf)
1767         } else {
1768             let data = self.find_handle(handle, inode)?;
1769 
1770             let dir = data.file.lock();
1771 
1772             ReadDir::new(&*dir, offset as libc::off64_t, buf)
1773         }
1774     }
1775 
open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1776     fn open(
1777         &self,
1778         _ctx: Context,
1779         inode: Inode,
1780         flags: u32,
1781     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1782         if self.zero_message_open.load(Ordering::Relaxed) {
1783             cros_tracing::trace_simple_print!(
1784                 "{}: open (zero-message): inode={inode}, flags={flags}",
1785                 self.tag
1786             );
1787             Err(io::Error::from_raw_os_error(libc::ENOSYS))
1788         } else {
1789             cros_tracing::trace_simple_print!("{}: open: inode={inode}, flags={flags}", self.tag);
1790             self.do_open(inode, flags)
1791         }
1792     }
1793 
release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option<u64>, ) -> io::Result<()>1794     fn release(
1795         &self,
1796         _ctx: Context,
1797         inode: Inode,
1798         _flags: u32,
1799         handle: Handle,
1800         _flush: bool,
1801         _flock_release: bool,
1802         _lock_owner: Option<u64>,
1803     ) -> io::Result<()> {
1804         if self.zero_message_open.load(Ordering::Relaxed) {
1805             cros_tracing::trace_simple_print!(
1806                 "{}: release (zero-message): inode={inode}, handle={handle}",
1807                 self.tag
1808             );
1809             Ok(())
1810         } else {
1811             cros_tracing::trace_simple_print!(
1812                 "{}: release: inode={inode}, handle={handle}",
1813                 self.tag
1814             );
1815             self.do_release(inode, handle)
1816         }
1817     }
1818 
chromeos_tmpfile( &self, ctx: Context, parent: Self::Inode, mode: u32, umask: u32, ) -> io::Result<Entry>1819     fn chromeos_tmpfile(
1820         &self,
1821         ctx: Context,
1822         parent: Self::Inode,
1823         mode: u32,
1824         umask: u32,
1825     ) -> io::Result<Entry> {
1826         cros_tracing::trace_simple_print!(
1827             "{}: chromeos_tempfile: inode={parent}, mode={mode}, umask={umask}",
1828             self.tag
1829         );
1830         let data = self.find_inode(parent)?;
1831 
1832         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1833 
1834         let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
1835 
1836         // Safe because this is a valid c string.
1837         let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
1838 
1839         let fd = {
1840             let _scoped_umask = ScopedUmask::new(umask);
1841 
1842             // Safe because this doesn't modify any memory and we check the return value.
1843             syscall!(unsafe {
1844                 libc::openat64(
1845                     data.as_raw_descriptor(),
1846                     current_dir.as_ptr(),
1847                     tmpflags,
1848                     mode,
1849                 )
1850             })?
1851         };
1852 
1853         // Safe because we just opened this fd.
1854         let tmpfile = unsafe { File::from_raw_descriptor(fd) };
1855 
1856         let st = stat(&tmpfile)?;
1857         Ok(self.add_entry(tmpfile, st, tmpflags))
1858     }
1859 
create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>1860     fn create(
1861         &self,
1862         ctx: Context,
1863         parent: Inode,
1864         name: &CStr,
1865         mode: u32,
1866         flags: u32,
1867         umask: u32,
1868     ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
1869         cros_tracing::trace_simple_print!(
1870             "{}: create: inode={parent}, name={:?}, mode={mode}, flags={flags}, umask={umask}",
1871             self.tag,
1872             name
1873         );
1874         let data = self.find_inode(parent)?;
1875 
1876         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1877 
1878         let create_flags =
1879             (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
1880 
1881         let fd = {
1882             let _scoped_umask = ScopedUmask::new(umask);
1883 
1884             // Safe because this doesn't modify any memory and we check the return value. We don't
1885             // really check `flags` because if the kernel can't handle poorly specified flags then
1886             // we have much bigger problems.
1887             syscall!(unsafe {
1888                 libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
1889             })?
1890         };
1891 
1892         // Safe because we just opened this fd.
1893         let file = unsafe { File::from_raw_descriptor(fd) };
1894 
1895         let st = stat(&file)?;
1896         let entry = self.add_entry(file, st, create_flags);
1897 
1898         let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
1899             (None, OpenOptions::KEEP_CACHE)
1900         } else {
1901             self.do_open(
1902                 entry.inode,
1903                 flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
1904             )
1905             .map_err(|e| {
1906                 // Don't leak the entry.
1907                 self.forget(ctx, entry.inode, 1);
1908                 e
1909             })?
1910         };
1911 
1912         Ok((entry, handle, opts))
1913     }
1914 
unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1915     fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1916         cros_tracing::trace_simple_print!("{}: unlink: inode={parent}, name={:?}", self.tag, name);
1917         let data = self.find_inode(parent)?;
1918         self.do_unlink(&data, name, 0)
1919     }
1920 
read<W: io::Write + ZeroCopyWriter>( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option<u64>, _flags: u32, ) -> io::Result<usize>1921     fn read<W: io::Write + ZeroCopyWriter>(
1922         &self,
1923         _ctx: Context,
1924         inode: Inode,
1925         handle: Handle,
1926         mut w: W,
1927         size: u32,
1928         offset: u64,
1929         _lock_owner: Option<u64>,
1930         _flags: u32,
1931     ) -> io::Result<usize> {
1932         if self.zero_message_open.load(Ordering::Relaxed) {
1933             cros_tracing::trace_simple_print!("{}: read (zero-message): inode={inode}, handle={handle}, size={size}, offset={offset}", self.tag);
1934             let data = self.find_inode(inode)?;
1935 
1936             let mut file = data.file.lock();
1937             let mut flags = file.1;
1938             match flags & libc::O_ACCMODE {
1939                 libc::O_WRONLY => {
1940                     flags &= !libc::O_WRONLY;
1941                     flags |= libc::O_RDWR;
1942 
1943                     // We need to get a readable handle for this file.
1944                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1945                     *file = (newfile, flags);
1946                 }
1947                 libc::O_RDONLY | libc::O_RDWR => {}
1948                 _ => panic!("Unexpected flags: {:#x}", flags),
1949             }
1950 
1951             w.write_from(&mut file.0, size as usize, offset)
1952         } else {
1953             cros_tracing::trace_simple_print!(
1954                 "{}: read: inode={inode}, handle={handle}, size={size}, offset={offset}",
1955                 self.tag
1956             );
1957             let data = self.find_handle(handle, inode)?;
1958 
1959             let mut f = data.file.lock();
1960             w.write_from(&mut f, size as usize, offset)
1961         }
1962     }
1963 
write<R: io::Read + ZeroCopyReader>( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option<u64>, _delayed_write: bool, flags: u32, ) -> io::Result<usize>1964     fn write<R: io::Read + ZeroCopyReader>(
1965         &self,
1966         _ctx: Context,
1967         inode: Inode,
1968         handle: Handle,
1969         mut r: R,
1970         size: u32,
1971         offset: u64,
1972         _lock_owner: Option<u64>,
1973         _delayed_write: bool,
1974         flags: u32,
1975     ) -> io::Result<usize> {
1976         // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
1977         // automatically clear the setuid and setgid bits for us.
1978         let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
1979             Some(drop_cap_fsetid()?)
1980         } else {
1981             None
1982         };
1983 
1984         if self.zero_message_open.load(Ordering::Relaxed) {
1985             cros_tracing::trace_simple_print!(
1986                 "{}: write (zero-message): inode={inode}, handle={handle}, size={size}, offset={offset}",
1987                 self.tag
1988             );
1989 
1990             let data = self.find_inode(inode)?;
1991 
1992             let mut file = data.file.lock();
1993             let mut flags = file.1;
1994             match flags & libc::O_ACCMODE {
1995                 libc::O_RDONLY => {
1996                     flags &= !libc::O_RDONLY;
1997                     flags |= libc::O_RDWR;
1998 
1999                     // We need to get a writable handle for this file.
2000                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2001                     *file = (newfile, flags);
2002                 }
2003                 libc::O_WRONLY | libc::O_RDWR => {}
2004                 _ => panic!("Unexpected flags: {:#x}", flags),
2005             }
2006 
2007             r.read_to(&mut file.0, size as usize, offset)
2008         } else {
2009             cros_tracing::trace_simple_print!(
2010                 "{}: write: inode={inode}, handle={handle}, size={size}, offset={offset}",
2011                 self.tag
2012             );
2013 
2014             let data = self.find_handle(handle, inode)?;
2015 
2016             let mut f = data.file.lock();
2017             r.read_to(&mut f, size as usize, offset)
2018         }
2019     }
2020 
getattr( &self, _ctx: Context, inode: Inode, _handle: Option<Handle>, ) -> io::Result<(libc::stat64, Duration)>2021     fn getattr(
2022         &self,
2023         _ctx: Context,
2024         inode: Inode,
2025         _handle: Option<Handle>,
2026     ) -> io::Result<(libc::stat64, Duration)> {
2027         cros_tracing::trace_simple_print!("{}: getattr: inode={inode}", self.tag);
2028 
2029         let data = self.find_inode(inode)?;
2030         self.do_getattr(&data)
2031     }
2032 
    // Applies the attributes from `attr` that are selected by `valid` to `inode` and returns the
    // result of `do_getattr` afterwards.
    //
    // The changes are applied in a fixed order: mode, owner/group, size, then access and
    // modification times. Each step returns early on failure, so earlier steps may already have
    // taken effect when an error is returned.
    //
    // When a non-zero `handle` is supplied its descriptor is used for the mode, size and time
    // changes. Otherwise the mode and time changes go through a `self/fd/<fd>` path relative to
    // `self.proc`, and the size change opens a fresh descriptor for the inode. Ownership changes
    // always go through the inode's own descriptor.
    fn setattr(
        &self,
        _ctx: Context,
        inode: Inode,
        attr: libc::stat64,
        handle: Option<Handle>,
        valid: SetattrValid,
    ) -> io::Result<(libc::stat64, Duration)> {
        cros_tracing::trace_simple_print!(
            "{}: setattr: inode={inode}, handle={:?}",
            self.tag,
            handle
        );
        let inode_data = self.find_inode(inode)?;

        // How the target file is addressed for the operations below. The `Handle` variant keeps
        // the handle data alongside the raw descriptor taken from it.
        enum Data {
            Handle(Arc<HandleData>, RawDescriptor),
            ProcPath(CString),
        }

        // If we have a handle then use it otherwise get a new fd from the inode.
        let data = if let Some(handle) = handle.filter(|&h| h != 0) {
            let hd = self.find_handle(handle, inode)?;

            let fd = hd.file.lock().as_raw_descriptor();
            Data::Handle(hd, fd)
        } else {
            let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
            Data::ProcPath(pathname)
        };

        if valid.contains(SetattrValid::MODE) {
            // Safe because this doesn't modify any memory and we check the return value.
            syscall!(unsafe {
                match data {
                    Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
                    Data::ProcPath(ref p) => {
                        libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
                    }
                }
            })?;
        }

        if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
            let uid = if valid.contains(SetattrValid::UID) {
                attr.st_uid
            } else {
                // Cannot use -1 here because these are unsigned values.
                ::std::u32::MAX
            };
            let gid = if valid.contains(SetattrValid::GID) {
                attr.st_gid
            } else {
                // Cannot use -1 here because these are unsigned values.
                ::std::u32::MAX
            };

            // Safe because this is a constant value and a valid C string.
            let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };

            // Safe because this doesn't modify any memory and we check the return value.
            syscall!(unsafe {
                libc::fchownat(
                    inode_data.as_raw_descriptor(),
                    empty.as_ptr(),
                    uid,
                    gid,
                    libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
                )
            })?;
        }

        if valid.contains(SetattrValid::SIZE) {
            // Safe because this doesn't modify any memory and we check the return value.
            syscall!(match data {
                Data::Handle(_, fd) => unsafe { libc::ftruncate64(fd, attr.st_size) },
                _ => {
                    // There is no `ftruncateat` so we need to get a new fd and truncate it.
                    let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
                    unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
                }
            })?;
        }

        if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
            // Index 0 is the access time and index 1 the modification time. Both start out as
            // `UTIME_OMIT` so that a timestamp that was not requested is left untouched.
            let mut tvs = [
                libc::timespec {
                    tv_sec: 0,
                    tv_nsec: libc::UTIME_OMIT,
                },
                libc::timespec {
                    tv_sec: 0,
                    tv_nsec: libc::UTIME_OMIT,
                },
            ];

            if valid.contains(SetattrValid::ATIME_NOW) {
                tvs[0].tv_nsec = libc::UTIME_NOW;
            } else if valid.contains(SetattrValid::ATIME) {
                tvs[0].tv_sec = attr.st_atime;
                tvs[0].tv_nsec = attr.st_atime_nsec;
            }

            if valid.contains(SetattrValid::MTIME_NOW) {
                tvs[1].tv_nsec = libc::UTIME_NOW;
            } else if valid.contains(SetattrValid::MTIME) {
                tvs[1].tv_sec = attr.st_mtime;
                tvs[1].tv_nsec = attr.st_mtime_nsec;
            }

            // Safe because this doesn't modify any memory and we check the return value.
            syscall!(unsafe {
                match data {
                    Data::Handle(_, fd) => libc::futimens(fd, tvs.as_ptr()),
                    Data::ProcPath(ref p) => {
                        libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
                    }
                }
            })?;
        }

        self.do_getattr(&inode_data)
    }
2157 
rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()>2158     fn rename(
2159         &self,
2160         _ctx: Context,
2161         olddir: Inode,
2162         oldname: &CStr,
2163         newdir: Inode,
2164         newname: &CStr,
2165         flags: u32,
2166     ) -> io::Result<()> {
2167         cros_tracing::trace_simple_print!(
2168             "{}: rename: olddir={olddir}, oldname={:?}, newdir={newdir}, newname={:?}, flags={flags}",
2169             self.tag,
2170             oldname,
2171             newname
2172         );
2173 
2174         let old_inode = self.find_inode(olddir)?;
2175         let new_inode = self.find_inode(newdir)?;
2176 
2177         // Safe because this doesn't modify any memory and we check the return value.
2178         // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
2179         // and we have glibc 2.28.
2180         syscall!(unsafe {
2181             libc::syscall(
2182                 libc::SYS_renameat2,
2183                 old_inode.as_raw_descriptor(),
2184                 oldname.as_ptr(),
2185                 new_inode.as_raw_descriptor(),
2186                 newname.as_ptr(),
2187                 flags,
2188             )
2189         })?;
2190         Ok(())
2191     }
2192 
mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, ) -> io::Result<Entry>2193     fn mknod(
2194         &self,
2195         ctx: Context,
2196         parent: Inode,
2197         name: &CStr,
2198         mode: u32,
2199         rdev: u32,
2200         umask: u32,
2201     ) -> io::Result<Entry> {
2202         cros_tracing::trace_simple_print!(
2203             "{}: mknod: inode={parent}, name={:?}, mode={mode}, rdev={rdev}, umask={umask}",
2204             self.tag,
2205             name
2206         );
2207         let data = self.find_inode(parent)?;
2208 
2209         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2210 
2211         {
2212             let _scoped_umask = ScopedUmask::new(umask);
2213 
2214             // Safe because this doesn't modify any memory and we check the return value.
2215             syscall!(unsafe {
2216                 libc::mknodat(
2217                     data.as_raw_descriptor(),
2218                     name.as_ptr(),
2219                     mode as libc::mode_t,
2220                     rdev as libc::dev_t,
2221                 )
2222             })?;
2223         }
2224 
2225         self.do_lookup(&data, name)
2226     }
2227 
link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result<Entry>2228     fn link(
2229         &self,
2230         _ctx: Context,
2231         inode: Inode,
2232         newparent: Inode,
2233         newname: &CStr,
2234     ) -> io::Result<Entry> {
2235         cros_tracing::trace_simple_print!(
2236             "{}: link: inode={inode}, newparent={newparent}, newmname={:?}",
2237             self.tag,
2238             newname
2239         );
2240         let data = self.find_inode(inode)?;
2241         let new_inode = self.find_inode(newparent)?;
2242 
2243         let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2244             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2245 
2246         // Safe because this doesn't modify any memory and we check the return value.
2247         syscall!(unsafe {
2248             libc::linkat(
2249                 self.proc.as_raw_descriptor(),
2250                 path.as_ptr(),
2251                 new_inode.as_raw_descriptor(),
2252                 newname.as_ptr(),
2253                 libc::AT_SYMLINK_FOLLOW,
2254             )
2255         })?;
2256 
2257         self.do_lookup(&new_inode, newname)
2258     }
2259 
symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, ) -> io::Result<Entry>2260     fn symlink(
2261         &self,
2262         ctx: Context,
2263         linkname: &CStr,
2264         parent: Inode,
2265         name: &CStr,
2266     ) -> io::Result<Entry> {
2267         cros_tracing::trace_simple_print!(
2268             "{}: symlink: inode={parent}, linkname={:?}, name={:?}",
2269             self.tag,
2270             linkname,
2271             name
2272         );
2273         let data = self.find_inode(parent)?;
2274 
2275         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2276 
2277         // Safe because this doesn't modify any memory and we check the return value.
2278         syscall!(unsafe {
2279             libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
2280         })?;
2281 
2282         self.do_lookup(&data, name)
2283     }
2284 
readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>>2285     fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
2286         cros_tracing::trace_simple_print!("{}: readlink: inode={inode}", self.tag);
2287         let data = self.find_inode(inode)?;
2288 
2289         let mut buf = vec![0; libc::PATH_MAX as usize];
2290 
2291         // Safe because this is a constant value and a valid C string.
2292         let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
2293 
2294         // Safe because this will only modify the contents of `buf` and we check the return value.
2295         let res = syscall!(unsafe {
2296             libc::readlinkat(
2297                 data.as_raw_descriptor(),
2298                 empty.as_ptr(),
2299                 buf.as_mut_ptr() as *mut libc::c_char,
2300                 buf.len(),
2301             )
2302         })?;
2303 
2304         buf.resize(res as usize, 0);
2305         Ok(buf)
2306     }
2307 
flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()>2308     fn flush(
2309         &self,
2310         _ctx: Context,
2311         inode: Inode,
2312         handle: Handle,
2313         _lock_owner: u64,
2314     ) -> io::Result<()> {
2315         cros_tracing::trace_simple_print!("{}: flush: inode={inode}, handle={handle}", self.tag);
2316         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2317             self.find_inode(inode)?
2318         } else {
2319             self.find_handle(handle, inode)?
2320         };
2321 
2322         // Since this method is called whenever an fd is closed in the client, we can emulate that
2323         // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
2324         // because this doesn't modify any memory and we check the return values.
2325         unsafe {
2326             let newfd = syscall!(libc::fcntl(
2327                 data.as_raw_descriptor(),
2328                 libc::F_DUPFD_CLOEXEC,
2329                 0
2330             ))?;
2331 
2332             syscall!(libc::close(newfd))?;
2333         }
2334         Ok(())
2335     }
2336 
fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()>2337     fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
2338         if self.zero_message_open.load(Ordering::Relaxed) {
2339             cros_tracing::trace_simple_print!(
2340                 "{}: fsync (zero-message): inode={inode}, datasync={datasync}, handle={handle}",
2341                 self.tag
2342             );
2343             let data = self.find_inode(inode)?;
2344             self.do_fsync(&*data, datasync)
2345         } else {
2346             cros_tracing::trace_simple_print!(
2347                 "{}: fsync: inode={inode}, datasync={datasync}, handle={handle}",
2348                 self.tag
2349             );
2350             let data = self.find_handle(handle, inode)?;
2351 
2352             let file = data.file.lock();
2353             self.do_fsync(&*file, datasync)
2354         }
2355     }
2356 
fsyncdir( &self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()>2357     fn fsyncdir(
2358         &self,
2359         _ctx: Context,
2360         inode: Inode,
2361         datasync: bool,
2362         handle: Handle,
2363     ) -> io::Result<()> {
2364         if self.zero_message_opendir.load(Ordering::Relaxed) {
2365             cros_tracing::trace_simple_print!(
2366                 "{}: fsyncdir (zero-message): inode={inode}, datasync={datasync}, handle={handle}",
2367                 self.tag
2368             );
2369             let data = self.find_inode(inode)?;
2370             self.do_fsync(&*data, datasync)
2371         } else {
2372             cros_tracing::trace_simple_print!(
2373                 "{}: fsyncdir: inode={inode}, datasync={datasync}, handle={handle}",
2374                 self.tag
2375             );
2376             let data = self.find_handle(handle, inode)?;
2377 
2378             let file = data.file.lock();
2379             self.do_fsync(&*file, datasync)
2380         }
2381     }
2382 
access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()>2383     fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
2384         cros_tracing::trace_simple_print!("{}: access: inode={inode}, mask={mask}", self.tag);
2385         let data = self.find_inode(inode)?;
2386 
2387         let st = stat(&*data)?;
2388         let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
2389 
2390         if mode == libc::F_OK {
2391             // The file exists since we were able to call `stat(2)` on it.
2392             return Ok(());
2393         }
2394 
2395         if (mode & libc::R_OK) != 0 {
2396             if ctx.uid != 0
2397                 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
2398                 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
2399                 && st.st_mode & 0o004 == 0
2400             {
2401                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2402             }
2403         }
2404 
2405         if (mode & libc::W_OK) != 0 {
2406             if ctx.uid != 0
2407                 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
2408                 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
2409                 && st.st_mode & 0o002 == 0
2410             {
2411                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2412             }
2413         }
2414 
2415         // root can only execute something if it is executable by one of the owner, the group, or
2416         // everyone.
2417         if (mode & libc::X_OK) != 0 {
2418             if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
2419                 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
2420                 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
2421                 && st.st_mode & 0o001 == 0
2422             {
2423                 return Err(io::Error::from_raw_os_error(libc::EACCES));
2424             }
2425         }
2426 
2427         Ok(())
2428     }
2429 
setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()>2430     fn setxattr(
2431         &self,
2432         _ctx: Context,
2433         inode: Inode,
2434         name: &CStr,
2435         value: &[u8],
2436         flags: u32,
2437     ) -> io::Result<()> {
2438         cros_tracing::trace_simple_print!(
2439             "{}: setxattr: inode={inode}, name={:?}, flags={flags}",
2440             self.tag,
2441             name
2442         );
2443         // We can't allow the VM to set this xattr because an unprivileged process may use it to set
2444         // a privileged xattr.
2445         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2446             return Err(io::Error::from_raw_os_error(libc::EPERM));
2447         }
2448 
2449         let data = self.find_inode(inode)?;
2450         let name = self.rewrite_xattr_name(name);
2451         let file = data.file.lock();
2452         let o_path_file = (file.1 & libc::O_PATH) != 0;
2453         if o_path_file {
2454             // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
2455             // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
2456             // setting the CWD back to the root directory.
2457             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2458                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2459 
2460             // Safe because this doesn't modify any memory and we check the return value.
2461             syscall!(self.with_proc_chdir(|| {
2462                 unsafe {
2463                     libc::setxattr(
2464                         path.as_ptr(),
2465                         name.as_ptr(),
2466                         value.as_ptr() as *const libc::c_void,
2467                         value.len() as libc::size_t,
2468                         flags as c_int,
2469                     )
2470                 }
2471             }))?;
2472         } else {
2473             // For regular files and directories, we can just use fsetxattr. Safe because this
2474             // doesn't modify any memory and we check the return value.
2475             syscall!(unsafe {
2476                 libc::fsetxattr(
2477                     file.0.as_raw_descriptor(),
2478                     name.as_ptr(),
2479                     value.as_ptr() as *const libc::c_void,
2480                     value.len() as libc::size_t,
2481                     flags as c_int,
2482                 )
2483             })?;
2484         }
2485 
2486         Ok(())
2487     }
2488 
getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result<GetxattrReply>2489     fn getxattr(
2490         &self,
2491         _ctx: Context,
2492         inode: Inode,
2493         name: &CStr,
2494         size: u32,
2495     ) -> io::Result<GetxattrReply> {
2496         cros_tracing::trace_simple_print!(
2497             "{}: getxattr: inode={inode}, name={:?}, size={size}",
2498             self.tag,
2499             name
2500         );
2501         // We don't allow the VM to set this xattr so we also pretend there is no value associated
2502         // with it.
2503         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2504             return Err(io::Error::from_raw_os_error(libc::ENODATA));
2505         }
2506 
2507         let data = self.find_inode(inode)?;
2508         let name = self.rewrite_xattr_name(name);
2509         let mut buf = vec![0u8; size as usize];
2510 
2511         // Safe because this will only modify the contents of `buf`.
2512         let res = self.do_getxattr(&data, &name, &mut buf[..])?;
2513         if size == 0 {
2514             Ok(GetxattrReply::Count(res as u32))
2515         } else {
2516             buf.truncate(res as usize);
2517             Ok(GetxattrReply::Value(buf))
2518         }
2519     }
2520 
listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply>2521     fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
2522         cros_tracing::trace_simple_print!("{}: listxattr: inode={inode}, size={size}", self.tag);
2523         let data = self.find_inode(inode)?;
2524 
2525         let mut buf = vec![0u8; size as usize];
2526 
2527         let file = data.file.lock();
2528         let o_path_file = (file.1 & libc::O_PATH) != 0;
2529         let res = if o_path_file {
2530             // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
2531             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2532             // and then setting the CWD back to the root directory.
2533             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2534                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2535 
2536             // Safe because this will only modify `buf` and we check the return value.
2537             syscall!(self.with_proc_chdir(|| unsafe {
2538                 libc::listxattr(
2539                     path.as_ptr(),
2540                     buf.as_mut_ptr() as *mut libc::c_char,
2541                     buf.len() as libc::size_t,
2542                 )
2543             }))?
2544         } else {
2545             // For regular files and directories, we can just flistxattr. Safe because this will only
2546             // write to `buf` and we check the return value.
2547             syscall!(unsafe {
2548                 libc::flistxattr(
2549                     file.0.as_raw_descriptor(),
2550                     buf.as_mut_ptr() as *mut libc::c_char,
2551                     buf.len() as libc::size_t,
2552                 )
2553             })?
2554         };
2555 
2556         if size == 0 {
2557             Ok(ListxattrReply::Count(res as u32))
2558         } else {
2559             buf.truncate(res as usize);
2560 
2561             if self.cfg.rewrite_security_xattrs {
2562                 strip_xattr_prefix(&mut buf);
2563             }
2564             Ok(ListxattrReply::Names(buf))
2565         }
2566     }
2567 
removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()>2568     fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
2569         cros_tracing::trace_simple_print!(
2570             "{}: removexattr: inode={inode}, name={:?}",
2571             self.tag,
2572             name
2573         );
2574         // We don't allow the VM to set this xattr so we also pretend there is no value associated
2575         // with it.
2576         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2577             return Err(io::Error::from_raw_os_error(libc::ENODATA));
2578         }
2579 
2580         let data = self.find_inode(inode)?;
2581         let name = self.rewrite_xattr_name(name);
2582 
2583         let file = data.file.lock();
2584         let o_path_file = (file.1 & libc::O_PATH) != 0;
2585         if o_path_file {
2586             // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
2587             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2588             // and then setting the CWD back to the root directory.
2589             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2590                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2591 
2592             // Safe because this doesn't modify any memory and we check the return value.
2593             syscall!(
2594                 self.with_proc_chdir(|| unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) })
2595             )?;
2596         } else {
2597             // For regular files and directories, we can just use fremovexattr. Safe because this
2598             // doesn't modify any memory and we check the return value.
2599             syscall!(unsafe { libc::fremovexattr(file.0.as_raw_descriptor(), name.as_ptr()) })?;
2600         }
2601 
2602         Ok(())
2603     }
2604 
fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()>2605     fn fallocate(
2606         &self,
2607         _ctx: Context,
2608         inode: Inode,
2609         handle: Handle,
2610         mode: u32,
2611         offset: u64,
2612         length: u64,
2613     ) -> io::Result<()> {
2614         cros_tracing::trace_simple_print!(
2615             "{}: fallocate: inode={inode}, handle={handle}, mode={mode}, offset={offset}, lenght={length}",
2616             self.tag
2617         );
2618 
2619         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2620             let data = self.find_inode(inode)?;
2621 
2622             {
2623                 // fallocate needs a writable fd
2624                 let mut file = data.file.lock();
2625                 let mut flags = file.1;
2626                 match flags & libc::O_ACCMODE {
2627                     libc::O_RDONLY => {
2628                         flags &= !libc::O_RDONLY;
2629                         flags |= libc::O_RDWR;
2630 
2631                         // We need to get a writable handle for this file.
2632                         let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2633                         *file = (newfile, flags);
2634                     }
2635                     libc::O_WRONLY | libc::O_RDWR => {}
2636                     _ => panic!("Unexpected flags: {:#x}", flags),
2637                 }
2638             }
2639 
2640             data
2641         } else {
2642             self.find_handle(handle, inode)?
2643         };
2644 
2645         let fd = data.as_raw_descriptor();
2646         // Safe because this doesn't modify any memory and we check the return value.
2647         syscall!(unsafe {
2648             libc::fallocate64(
2649                 fd,
2650                 mode as libc::c_int,
2651                 offset as libc::off64_t,
2652                 length as libc::off64_t,
2653             )
2654         })?;
2655 
2656         Ok(())
2657     }
2658 
ioctl<R: io::Read>( &self, ctx: Context, inode: Inode, handle: Handle, _flags: IoctlFlags, cmd: u32, _arg: u64, in_size: u32, out_size: u32, r: R, ) -> io::Result<IoctlReply>2659     fn ioctl<R: io::Read>(
2660         &self,
2661         ctx: Context,
2662         inode: Inode,
2663         handle: Handle,
2664         _flags: IoctlFlags,
2665         cmd: u32,
2666         _arg: u64,
2667         in_size: u32,
2668         out_size: u32,
2669         r: R,
2670     ) -> io::Result<IoctlReply> {
2671         cros_tracing::trace_simple_print!(
2672             "{}: ioctl: inode={inode}, handle={handle}, cmd={cmd}, in_size={in_size}, out_size={out_size}",
2673             self.tag
2674         );
2675 
2676         const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
2677         const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
2678         const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
2679         const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
2680         const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
2681         const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
2682         const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
2683         const ENABLE_VERITY: u32 = FS_IOC_ENABLE_VERITY() as u32;
2684         const MEASURE_VERITY: u32 = FS_IOC_MEASURE_VERITY() as u32;
2685 
2686         match cmd {
2687             GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
2688             GET_FSXATTR => {
2689                 if out_size < size_of::<fsxattr>() as u32 {
2690                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2691                 } else {
2692                     self.get_fsxattr(inode, handle)
2693                 }
2694             }
2695             SET_FSXATTR => {
2696                 if in_size < size_of::<fsxattr>() as u32 {
2697                     Err(io::Error::from_raw_os_error(libc::EINVAL))
2698                 } else {
2699                     self.set_fsxattr(ctx, inode, handle, r)
2700                 }
2701             }
2702             GET_FLAGS32 | GET_FLAGS64 => {
2703                 if out_size < size_of::<c_int>() as u32 {
2704                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2705                 } else {
2706                     self.get_flags(inode, handle)
2707                 }
2708             }
2709             SET_FLAGS32 | SET_FLAGS64 => {
2710                 if in_size < size_of::<c_int>() as u32 {
2711                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2712                 } else {
2713                     self.set_flags(ctx, inode, handle, r)
2714                 }
2715             }
2716             ENABLE_VERITY => {
2717                 if in_size < size_of::<fsverity_enable_arg>() as u32 {
2718                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2719                 } else {
2720                     self.enable_verity(inode, handle, r)
2721                 }
2722             }
2723             MEASURE_VERITY => {
2724                 if in_size < size_of::<fsverity_digest>() as u32
2725                     || out_size < size_of::<fsverity_digest>() as u32
2726                 {
2727                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
2728                 } else {
2729                     self.measure_verity(inode, handle, r, out_size)
2730                 }
2731             }
2732             _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
2733         }
2734     }
2735 
copy_file_range( &self, ctx: Context, inode_src: Inode, handle_src: Handle, offset_src: u64, inode_dst: Inode, handle_dst: Handle, offset_dst: u64, length: u64, flags: u64, ) -> io::Result<usize>2736     fn copy_file_range(
2737         &self,
2738         ctx: Context,
2739         inode_src: Inode,
2740         handle_src: Handle,
2741         offset_src: u64,
2742         inode_dst: Inode,
2743         handle_dst: Handle,
2744         offset_dst: u64,
2745         length: u64,
2746         flags: u64,
2747     ) -> io::Result<usize> {
2748         cros_tracing::trace_simple_print!(
2749             "{}: copy_file_range: src=({inode_src}, {handle_src}, {offset_src}), dst=({inode_dst}, {handle_dst}, {offset_dst}), length={length}, flags={flags}",
2750             self.tag
2751         );
2752         // We need to change credentials during a write so that the kernel will remove setuid or
2753         // setgid bits from the file if it was written to by someone other than the owner.
2754         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2755         let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
2756             if self.zero_message_open.load(Ordering::Relaxed) {
2757                 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
2758             } else {
2759                 (
2760                     self.find_handle(handle_src, inode_src)?,
2761                     self.find_handle(handle_dst, inode_dst)?,
2762                 )
2763             };
2764 
2765         let src = src_data.as_raw_descriptor();
2766         let dst = dst_data.as_raw_descriptor();
2767 
2768         Ok(syscall!(unsafe {
2769             libc::syscall(
2770                 libc::SYS_copy_file_range,
2771                 src,
2772                 &offset_src,
2773                 dst,
2774                 &offset_dst,
2775                 length,
2776                 flags,
2777             )
2778         })? as usize)
2779     }
2780 
set_up_mapping<M: Mapper>( &self, _ctx: Context, inode: Self::Inode, _handle: Self::Handle, file_offset: u64, mem_offset: u64, size: usize, prot: u32, mapper: M, ) -> io::Result<()>2781     fn set_up_mapping<M: Mapper>(
2782         &self,
2783         _ctx: Context,
2784         inode: Self::Inode,
2785         _handle: Self::Handle,
2786         file_offset: u64,
2787         mem_offset: u64,
2788         size: usize,
2789         prot: u32,
2790         mapper: M,
2791     ) -> io::Result<()> {
2792         cros_tracing::trace_simple_print!(
2793             "{}: set_up_mapping: inode={inode}, file_offset={file_offset}, mem_offset={mem_offset}, size={size}, prot={prot}",
2794             self.tag
2795         );
2796         if !self.cfg.use_dax {
2797             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
2798         }
2799 
2800         let read = prot & libc::PROT_READ as u32 != 0;
2801         let write = prot & libc::PROT_WRITE as u32 != 0;
2802         let mmap_flags = match (read, write) {
2803             (true, true) => libc::O_RDWR,
2804             (true, false) => libc::O_RDONLY,
2805             (false, true) => libc::O_RDWR, // mmap always requires an fd opened for reading.
2806             (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
2807         };
2808 
2809         let data = self.find_inode(inode)?;
2810 
2811         if self.zero_message_open.load(Ordering::Relaxed) {
2812             let mut file = data.file.lock();
2813             let mut open_flags = file.1;
2814             match (mmap_flags, open_flags & libc::O_ACCMODE) {
2815                 (libc::O_RDONLY, libc::O_WRONLY)
2816                 | (libc::O_RDWR, libc::O_RDONLY)
2817                 | (libc::O_RDWR, libc::O_WRONLY) => {
2818                     // We have a read-only or write-only fd and we need to upgrade it.
2819                     open_flags &= !libc::O_ACCMODE;
2820                     open_flags |= libc::O_RDWR;
2821 
2822                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2823                     *file = (newfile, open_flags);
2824                 }
2825                 (libc::O_RDONLY, libc::O_RDONLY)
2826                 | (libc::O_RDONLY, libc::O_RDWR)
2827                 | (libc::O_RDWR, libc::O_RDWR) => {}
2828                 (m, o) => panic!(
2829                     "Unexpected combination of access flags: ({:#x}, {:#x})",
2830                     m, o
2831                 ),
2832             }
2833             mapper.map(mem_offset, size, &file.0, file_offset, prot)
2834         } else {
2835             let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
2836             mapper.map(mem_offset, size, &file, file_offset, prot)
2837         }
2838     }
2839 
remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()>2840     fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
2841         cros_tracing::trace_simple_print!("{}: remove_mapping: msgs={:?}", self.tag, msgs);
2842         if !self.cfg.use_dax {
2843             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
2844         }
2845 
2846         for RemoveMappingOne { moffset, len } in msgs {
2847             mapper.unmap(*moffset, *len)?;
2848         }
2849         Ok(())
2850     }
2851 }
2852 
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `strip_xattr_prefix` on a copy of `input` and returns the resulting bytes.
    fn stripped(input: &[u8]) -> Vec<u8> {
        let mut names = input.to_vec();
        strip_xattr_prefix(&mut names);
        names
    }

    /// Checks which xattr names `rewrite_xattr_name` leaves alone and which it rewrites when
    /// `rewrite_security_xattrs` is enabled.
    #[test]
    fn rewrite_xattr_names() {
        let cfg = Config {
            rewrite_security_xattrs: true,
            ..Default::default()
        };

        let fs = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");

        // Selinux shouldn't get overwritten; user, trusted, and system should not be changed
        // either.
        let unchanged: [&[u8]; 4] = [
            b"security.selinux\0",
            b"user.foobar\0",
            b"trusted.foobar\0",
            b"system.foobar\0",
        ];
        for raw in unchanged.iter().copied() {
            let name = CStr::from_bytes_with_nul(raw).expect("test name must be NUL-terminated");
            assert_eq!(fs.rewrite_xattr_name(name).to_bytes(), name.to_bytes());
        }

        // sehash should be re-written.
        let sehash = CStr::from_bytes_with_nul(b"security.sehash\0")
            .expect("test name must be NUL-terminated");
        assert_eq!(
            fs.rewrite_xattr_name(sehash).to_bytes(),
            b"user.virtiofs.security.sehash"
        );
    }

    /// Checks `strip_xattr_prefix` against degenerate inputs and lists with and without the
    /// `user.virtiofs.` prefix.
    #[test]
    fn strip_xattr_names() {
        // Inputs that must come back unchanged.
        let only_nuls = b"\0\0\0\0\0";
        assert_eq!(&stripped(only_nuls)[..], &only_nuls[..]);

        let no_nuls = b"security.sehashuser.virtiofs";
        assert_eq!(&stripped(no_nuls)[..], &no_nuls[..]);

        let empty = b"";
        assert_eq!(&stripped(empty)[..], &empty[..]);

        let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
        assert_eq!(&stripped(no_strippable_names)[..], &no_strippable_names[..]);

        // Inputs where the prefix must be removed.
        let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
        assert_eq!(
            &stripped(only_strippable_names)[..],
            &b"security.sehash\0security.wat\0"[..]
        );

        let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
        let expected = b"security.sehash\0security.selinux\0security.wat\0user.foobar\0";
        assert_eq!(&stripped(mixed_names)[..], &expected[..]);

        let no_nul_with_prefix = b"user.virtiofs.security.sehash";
        assert_eq!(&stripped(no_nul_with_prefix)[..], &b"security.sehash"[..]);
    }
}
2925