1 // Copyright 2019 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::borrow::Cow;
6 use std::cmp;
7 use std::collections::btree_map;
8 use std::collections::BTreeMap;
9 use std::ffi::{CStr, CString};
10 use std::fs::File;
11 use std::io;
12 use std::mem::{self, size_of, MaybeUninit};
13 use std::os::raw::{c_int, c_long};
14 use std::str::FromStr;
15 use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
16 use std::sync::Arc;
17 use std::time::Duration;
18
19 use base::{
20 error, ioctl_ior_nr, ioctl_iow_nr, ioctl_iowr_nr, ioctl_with_mut_ptr, ioctl_with_ptr,
21 AsRawDescriptor, FromRawDescriptor, RawDescriptor,
22 };
23 use data_model::DataInit;
24 use fuse::filesystem::{
25 Context, DirectoryIterator, Entry, FileSystem, FsOptions, GetxattrReply, IoctlFlags,
26 IoctlReply, ListxattrReply, OpenOptions, RemoveMappingOne, SetattrValid, ZeroCopyReader,
27 ZeroCopyWriter, ROOT_ID,
28 };
29 use fuse::sys::WRITE_KILL_PRIV;
30 use fuse::Mapper;
31 use sync::Mutex;
32
33 use crate::virtio::fs::caps::{Capability, Caps, Set as CapSet, Value as CapValue};
34 use crate::virtio::fs::multikey::MultikeyBTreeMap;
35 use crate::virtio::fs::read_dir::ReadDir;
36
37 const EMPTY_CSTR: &[u8] = b"\0";
38 const ROOT_CSTR: &[u8] = b"/\0";
39 const PROC_CSTR: &[u8] = b"/proc\0";
40
41 const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
42 const SECURITY_XATTR: &[u8] = b"security.";
43 const SELINUX_XATTR: &[u8] = b"security.selinux";
44
45 const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
46 const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;
47
// Version 1 fscrypt policy record. It is one of the variants of the `fscrypt_policy` union
// that the kernel fills in for `FS_IOC_GET_ENCRYPTION_POLICY_EX`.
//
// NOTE(review): the field layout presumably mirrors `struct fscrypt_policy_v1` from the Linux
// fscrypt UAPI header -- confirm against the kernel headers before changing any field.
#[repr(C)]
#[derive(Clone, Copy)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
// The struct is `repr(C)` and made up exclusively of `u8` fields and a `u8` array, so it has no
// padding and every byte pattern is a valid value.
unsafe impl DataInit for fscrypt_policy_v1 {}
58
// Version 2 fscrypt policy record. It is one of the variants of the `fscrypt_policy` union
// that the kernel fills in for `FS_IOC_GET_ENCRYPTION_POLICY_EX`.
//
// NOTE(review): the field layout presumably mirrors `struct fscrypt_policy_v2` from the Linux
// fscrypt UAPI header -- confirm against the kernel headers before changing any field.
#[repr(C)]
#[derive(Clone, Copy)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
// The struct is `repr(C)` and made up exclusively of `u8` fields and `u8` arrays, so it has no
// padding and every byte pattern is a valid value.
unsafe impl DataInit for fscrypt_policy_v2 {}
70
// Storage large enough for any supported fscrypt policy version. All variants start with a
// one-byte version field, which is also exposed directly as `_version`.
#[repr(C)]
#[derive(Copy, Clone)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
// Every variant consists only of bytes, so any byte pattern is a valid value of the union.
unsafe impl DataInit for fscrypt_policy {}
79
// Argument block for `FS_IOC_GET_ENCRYPTION_POLICY_EX`: the caller supplies the number of bytes
// available in `policy` through `policy_size`, and the kernel writes back the policy together
// with its actual size.
#[repr(C)]
#[derive(Copy, Clone)]
struct fscrypt_get_policy_ex_arg {
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}
// The struct only holds a `u64` followed by a byte-only union, so any byte pattern is valid.
unsafe impl DataInit for fscrypt_get_policy_ex_arg {}

// The ioctl number is encoded with a 9-byte argument (`[u8; 9]`) and not with the size of
// `fscrypt_get_policy_ex_arg`.
// NOTE(review): this presumably matches the kernel's `_IOWR('f', 22, __u8[9])` definition (the
// 8-byte size plus the 1-byte version) -- confirm against the fscrypt UAPI header.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
89
// Argument block for the `FS_IOC_FSGETXATTR` and `FS_IOC_FSSETXATTR` ioctls. The fields are
// never interpreted by this file; the struct is only copied between the FUSE request/reply
// and the kernel.
#[repr(C)]
#[derive(Clone, Copy)]
struct fsxattr {
    _fsx_xflags: u32,     /* xflags field value (get/set) */
    _fsx_extsize: u32,    /* extsize field value (get/set)*/
    _fsx_nextents: u32,   /* nextents field value (get) */
    _fsx_projid: u32,     /* project identifier (get/set) */
    _fsx_cowextsize: u32, /* CoW extsize field value (get/set)*/
    _fsx_pad: [u8; 8],
}
// The struct is `repr(C)` and holds only `u32` fields followed by a `u8` array, so it has no
// padding and every byte pattern is a valid value.
unsafe impl DataInit for fsxattr {}
101
// Ioctl numbers for reading and writing the attributes described by `fsxattr`.
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// Ioctl numbers for reading and writing inode flags. The argument type is `c_long`, so the
// encoded number depends on the width of `long` on the platform this code is built for.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

// The same flag ioctls encoded with a fixed 32-bit argument.
ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

// The same flag ioctls encoded with a fixed 64-bit argument.
ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
113
// Key of an entry in `PassthroughFs::inodes`.
type Inode = u64;
// Key of an entry in `PassthroughFs::handles`.
type Handle = u64;
116
// Secondary key for the inode table: the (inode number, device) pair reported by `stat` for a
// file. It is used to find an already tracked entry for the same underlying file.
#[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
122
// Coarse classification of a file, derived from the `S_IFMT` bits of its mode.
#[derive(PartialEq, Eq)]
enum FileType {
    // `S_IFREG`.
    Regular,
    // `S_IFDIR`.
    Directory,
    // Anything that is neither a regular file nor a directory.
    Other,
}
129
130 impl From<libc::mode_t> for FileType {
from(mode: libc::mode_t) -> Self131 fn from(mode: libc::mode_t) -> Self {
132 match mode & libc::S_IFMT {
133 libc::S_IFREG => FileType::Regular,
134 libc::S_IFDIR => FileType::Directory,
135 _ => FileType::Other,
136 }
137 }
138 }
139
// Bookkeeping for one entry of the inode table.
struct InodeData {
    // The key under which this entry is stored in the inode table.
    inode: Inode,
    // (File, open_flags): the descriptor kept open for this inode together with the flags it was
    // opened with.
    file: Mutex<(File, libc::c_int)>,
    // Number of references handed out for this entry; starts at 1 when the entry is created and
    // is incremented whenever the same file is added again.
    refcount: AtomicU64,
    // Classification of the file at the time the entry was created.
    filetype: FileType,
}
147
148 impl AsRawDescriptor for InodeData {
as_raw_descriptor(&self) -> RawDescriptor149 fn as_raw_descriptor(&self) -> RawDescriptor {
150 self.file.lock().0.as_raw_descriptor()
151 }
152 }
153
// Bookkeeping for one entry of the handle table.
struct HandleData {
    // The inode this handle was opened for; lookups check it against the inode named in a request.
    inode: Inode,
    // The open file backing the handle.
    file: Mutex<File>,
}
158
159 impl AsRawDescriptor for HandleData {
as_raw_descriptor(&self) -> RawDescriptor160 fn as_raw_descriptor(&self) -> RawDescriptor {
161 self.file.lock().as_raw_descriptor()
162 }
163 }
164
// Generates `$name`, a guard type for one credential of the calling thread. `$ty` is the type of
// the id (uid or gid) and `$syscall_nr` is the number of the raw syscall used to change it.
// `$name::new` switches the credential and the guard's `Drop` implementation switches it back.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            // The value that is restored when the guard is dropped.
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
            // credentials back to `old` when the returned struct is dropped.
            //
            // Returns `Ok(None)` without making a syscall when `val` already equals `old`, and the
            // OS error when the syscall fails.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                if val == old {
                    // Nothing to do since we already have the correct value.
                    return Ok(None);
                }

                // We want credential changes to be per-thread because otherwise
                // we might interfere with operations being carried out on other
                // threads with different uids/gids. However, posix requires that
                // all threads in a process share the same credentials. To do this
                // libc uses signals to ensure that when one thread changes its
                // credentials the other threads do the same thing.
                //
                // So instead we invoke the syscall directly in order to get around
                // this limitation. Another option is to use the setfsuid and
                // setfsgid systems calls. However since those calls have no way to
                // return an error, it's preferable to do this instead.

                // This call is safe because it doesn't modify any memory and we
                // check the return value. Only the middle argument carries a new value;
                // the first and third arguments are passed as -1.
                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
                if res == 0 {
                    Ok(Some($name { old }))
                } else {
                    Err(io::Error::last_os_error())
                }
            }
        }

        impl Drop for $name {
            fn drop(&mut self) {
                // Safe for the same reason as in `new`: the call doesn't modify any memory and
                // the return value is checked. A failure can only be logged here because `drop`
                // has no way to report it to the caller.
                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if res < 0 {
                    error!(
                        "failed to change credentials back to {}: {}",
                        self.old,
                        io::Error::last_os_error(),
                    );
                }
            }
        }
    };
}
// Guard for the thread's effective uid. On 32-bit ARM the `*32` variant of the syscall is used.
// NOTE(review): presumably because the un-suffixed syscalls on that architecture only take
// 16-bit ids -- confirm against the kernel syscall table.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);

// Guard for the thread's effective gid, with the same architecture split as `ScopedUid`.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);

// Syscall number used to query the thread's effective uid.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
#[cfg(target_arch = "arm")]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;

// Syscall number used to query the thread's effective gid.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
#[cfg(target_arch = "arm")]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
237
// Per-thread cache of the effective uid and gid. `set_creds` reads these as the values that the
// credential guards restore when they are dropped.
// NOTE(review): this assumes nothing else permanently changes the thread's effective ids after
// the cached values have been read -- confirm if other code paths change credentials.
thread_local! {
    // Both these calls are safe because they take no parameters, and only return an integer value.
    // The kernel also guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
244
set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)>245 fn set_creds(
246 uid: libc::uid_t,
247 gid: libc::gid_t,
248 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
249 let olduid = THREAD_EUID.with(|uid| *uid);
250 let oldgid = THREAD_EGID.with(|gid| *gid);
251
252 // We have to change the gid before we change the uid because if we change the uid first then we
253 // lose the capability to change the gid. However changing back can happen in any order.
254 ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
255 }
256
// Guard returned by `Umask::set`. While it is alive the process umask is `mask`; dropping it
// puts `old` back.
struct ScopedUmask<'a> {
    // The umask that was in effect before `Umask::set` was called.
    old: libc::mode_t,
    // The umask installed by `Umask::set`.
    mask: libc::mode_t,
    // Exclusive borrow of the `Umask` that created this guard, so that only one guard per `Umask`
    // can exist at a time.
    _factory: &'a mut Umask,
}
262
263 impl<'a> Drop for ScopedUmask<'a> {
drop(&mut self)264 fn drop(&mut self) {
265 // Safe because this doesn't modify any memory and always succeeds.
266 let previous = unsafe { libc::umask(self.old) };
267 debug_assert_eq!(
268 previous, self.mask,
269 "umask changed while holding ScopedUmask"
270 );
271 }
272 }
273
274 struct Umask;
275
276 impl Umask {
set(&mut self, mask: libc::mode_t) -> ScopedUmask277 fn set(&mut self, mask: libc::mode_t) -> ScopedUmask {
278 ScopedUmask {
279 // Safe because this doesn't modify any memory and always succeeds.
280 old: unsafe { libc::umask(mask) },
281 mask,
282 _factory: self,
283 }
284 }
285 }
286
287 struct ScopedFsetid(Caps);
288 impl Drop for ScopedFsetid {
drop(&mut self)289 fn drop(&mut self) {
290 if let Err(e) = raise_cap_fsetid(&mut self.0) {
291 error!(
292 "Failed to restore CAP_FSETID: {}. Some operations may be broken.",
293 e
294 )
295 }
296 }
297 }
298
raise_cap_fsetid(c: &mut Caps) -> io::Result<()>299 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
300 c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
301 c.apply()
302 }
303
304 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
305 // adds the capability back when it is dropped.
drop_cap_fsetid() -> io::Result<ScopedFsetid>306 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
307 let mut caps = Caps::for_current_thread()?;
308 caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
309 caps.apply()?;
310 Ok(ScopedFsetid(caps))
311 }
312
ebadf() -> io::Error313 fn ebadf() -> io::Error {
314 io::Error::from_raw_os_error(libc::EBADF)
315 }
316
stat<F: AsRawDescriptor>(f: &F) -> io::Result<libc::stat64>317 fn stat<F: AsRawDescriptor>(f: &F) -> io::Result<libc::stat64> {
318 let mut st = MaybeUninit::<libc::stat64>::zeroed();
319
320 // Safe because this is a constant value and a valid C string.
321 let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
322
323 // Safe because the kernel will only write data in `st` and we check the return
324 // value.
325 let res = unsafe {
326 libc::fstatat64(
327 f.as_raw_descriptor(),
328 pathname.as_ptr(),
329 st.as_mut_ptr(),
330 libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
331 )
332 };
333 if res >= 0 {
334 // Safe because the kernel guarantees that the struct is now fully initialized.
335 Ok(unsafe { st.assume_init() })
336 } else {
337 Err(io::Error::last_os_error())
338 }
339 }
340
statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64>341 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
342 let mut st = MaybeUninit::<libc::stat64>::zeroed();
343
344 // Safe because the kernel will only write data in `st` and we check the return
345 // value.
346 let res = unsafe {
347 libc::fstatat64(
348 dir.as_raw_descriptor(),
349 name.as_ptr(),
350 st.as_mut_ptr(),
351 libc::AT_SYMLINK_NOFOLLOW,
352 )
353 };
354 if res >= 0 {
355 // Safe because the kernel guarantees that the struct is now fully initialized.
356 Ok(unsafe { st.assume_init() })
357 } else {
358 Err(io::Error::last_os_error())
359 }
360 }
361
/// The caching policy that the file system should report to the FUSE client. By default the FUSE
/// protocol uses close-to-open consistency. This means that any cached contents of the file are
/// invalidated the next time that file is opened.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum CachePolicy {
    /// The client should never cache file data and all I/O should be directly forwarded to the
    /// server. This policy must be selected when file contents may change without the knowledge of
    /// the FUSE client (i.e., the file system does not have exclusive access to the directory).
    Never,

    /// The client is free to choose when and how to cache file data. This is the default policy and
    /// uses close-to-open consistency as described in the enum documentation.
    Auto,

    /// The client should always cache file data. This means that the FUSE client will not
    /// invalidate any cached data that was returned by the file system the last time the file was
    /// opened. This policy should only be selected when the file system has exclusive access to the
    /// directory.
    Always,
}

impl FromStr for CachePolicy {
    type Err = &'static str;

    /// Parses a policy name.
    ///
    /// The names `never`, `auto` and `always` are accepted in any combination of ASCII upper and
    /// lower case (for example `"Never"` or `"ALWAYS"`). The input is not trimmed, so surrounding
    /// whitespace makes the name invalid.
    ///
    /// # Errors
    ///
    /// Returns `"invalid cache policy"` for every other input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.eq_ignore_ascii_case("never") {
            Ok(CachePolicy::Never)
        } else if s.eq_ignore_ascii_case("auto") {
            Ok(CachePolicy::Auto)
        } else if s.eq_ignore_ascii_case("always") {
            Ok(CachePolicy::Always)
        } else {
            Err("invalid cache policy")
        }
    }
}

impl Default for CachePolicy {
    /// Returns [`CachePolicy::Auto`].
    fn default() -> Self {
        CachePolicy::Auto
    }
}
401
/// Options that configure the behavior of the file system.
#[derive(Debug, Clone)]
pub struct Config {
    /// How long the FUSE client should consider directory entries to be valid. If the contents of a
    /// directory can only be modified by the FUSE client (i.e., the file system has exclusive
    /// access), then this should be a large value.
    ///
    /// The default value for this option is 5 seconds.
    pub entry_timeout: Duration,

    /// How long the FUSE client should consider file and directory attributes to be valid. If the
    /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file
    /// system has exclusive access), then this should be set to a large value.
    ///
    /// The default value for this option is 5 seconds.
    pub attr_timeout: Duration,

    /// The caching policy the file system should use. See the documentation of `CachePolicy` for
    /// more details.
    ///
    /// The default value for this option is the default `CachePolicy`.
    pub cache_policy: CachePolicy,

    /// Whether the file system should enable writeback caching. This can improve performance as it
    /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file
    /// system. However, enabling this option can increase the risk of data corruption if the file
    /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT**
    /// have exclusive access). Additionally, the file system should have read access to all files
    /// in the directory it is serving as the FUSE client may send read requests even for files
    /// opened with `O_WRONLY`.
    ///
    /// Therefore callers should only enable this option when they can guarantee that: 1) the file
    /// system has exclusive access to the directory and 2) the file system has read permissions for
    /// all files in that directory.
    ///
    /// The default value for this option is `false`.
    pub writeback: bool,

    /// Controls whether security.* xattrs (except for security.selinux) are re-written. When this
    /// is set to true, the server will add a "user.virtiofs." prefix to xattrs in the security
    /// namespace. Setting these xattrs requires CAP_SYS_ADMIN in the namespace where the file
    /// system was mounted and since the server usually runs in an unprivileged user namespace, it's
    /// unlikely to have that capability.
    ///
    /// The default value for this option is `false`.
    pub rewrite_security_xattrs: bool,

    /// Use case-insensitive lookups for directory entries (ASCII only).
    ///
    /// The default value for this option is `false`.
    pub ascii_casefold: bool,
}
452
453 impl Default for Config {
default() -> Self454 fn default() -> Self {
455 Config {
456 entry_timeout: Duration::from_secs(5),
457 attr_timeout: Duration::from_secs(5),
458 cache_policy: Default::default(),
459 writeback: false,
460 rewrite_security_xattrs: false,
461 ascii_casefold: false,
462 }
463 }
464 }
465
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs {
    // File descriptors for various points in the file system tree. Entries can be looked up
    // either by their `Inode` value or by the (ino, dev) pair of the underlying file.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    // The next unused `Inode` value. Starts just after `ROOT_ID`.
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    // The next unused `Handle` value. Starts at 1.
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to ensure that only one thread at a time uses chdir(). Since chdir() affects the
    // process-wide CWD, we cannot allow more than one thread to do it at the same time.
    chdir_mutex: Mutex<()>,

    // Used when creating files / directories / nodes. Since the umask is process-wide, we can only
    // allow one thread at a time to change it.
    umask: Mutex<Umask>,

    // The options this file system was created with.
    cfg: Config,
}
507
508 impl PassthroughFs {
new(cfg: Config) -> io::Result<PassthroughFs>509 pub fn new(cfg: Config) -> io::Result<PassthroughFs> {
510 // Safe because this is a constant value and a valid C string.
511 let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
512
513 // Safe because this doesn't modify any memory and we check the return value.
514 let raw_descriptor = unsafe {
515 libc::openat(
516 libc::AT_FDCWD,
517 proc_cstr.as_ptr(),
518 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
519 )
520 };
521 if raw_descriptor < 0 {
522 return Err(io::Error::last_os_error());
523 }
524
525 // Safe because we just opened this descriptor.
526 let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
527
528 Ok(PassthroughFs {
529 inodes: Mutex::new(MultikeyBTreeMap::new()),
530 next_inode: AtomicU64::new(ROOT_ID + 1),
531
532 handles: Mutex::new(BTreeMap::new()),
533 next_handle: AtomicU64::new(1),
534
535 proc,
536
537 writeback: AtomicBool::new(false),
538 zero_message_open: AtomicBool::new(false),
539 zero_message_opendir: AtomicBool::new(false),
540
541 chdir_mutex: Mutex::new(()),
542 umask: Mutex::new(Umask),
543 cfg,
544 })
545 }
546
keep_rds(&self) -> Vec<RawDescriptor>547 pub fn keep_rds(&self) -> Vec<RawDescriptor> {
548 vec![self.proc.as_raw_descriptor()]
549 }
550
rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr>551 fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
552 if !self.cfg.rewrite_security_xattrs {
553 return Cow::Borrowed(name);
554 }
555
556 // Does not include nul-terminator.
557 let buf = name.to_bytes();
558 if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
559 return Cow::Borrowed(name);
560 }
561
562 let mut newname = USER_VIRTIOFS_XATTR.to_vec();
563 newname.extend_from_slice(buf);
564
565 // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
566 // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
567 Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
568 }
569
find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>>570 fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
571 self.inodes
572 .lock()
573 .get(&inode)
574 .map(Arc::clone)
575 .ok_or_else(ebadf)
576 }
577
find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>>578 fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
579 self.handles
580 .lock()
581 .get(&handle)
582 .filter(|hd| hd.inode == inode)
583 .map(Arc::clone)
584 .ok_or_else(ebadf)
585 }
586
open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File>587 fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
588 let pathname = CString::new(format!("self/fd/{}", fd))
589 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
590
591 // Safe because this doesn't modify any memory and we check the return value. We don't
592 // really check `flags` because if the kernel can't handle poorly specified flags then we
593 // have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
594 // to follow the `/proc/self/fd` symlink to get the file.
595 let raw_descriptor = unsafe {
596 libc::openat(
597 self.proc.as_raw_descriptor(),
598 pathname.as_ptr(),
599 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
600 )
601 };
602 if raw_descriptor < 0 {
603 return Err(io::Error::last_os_error());
604 }
605
606 // Safe because we just opened this descriptor.
607 Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
608 }
609
open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File>610 fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
611 // When writeback caching is enabled, the kernel may send read requests even if the
612 // userspace program opened the file write-only. So we need to ensure that we have opened
613 // the file for reading as well as writing.
614 let writeback = self.writeback.load(Ordering::Relaxed);
615 if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
616 flags &= !libc::O_ACCMODE;
617 flags |= libc::O_RDWR;
618 }
619
620 // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
621 // However, this breaks atomicity as the file may have changed on disk, invalidating the
622 // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
623 // the file. Just allow this for now as it is the user's responsibility to enable writeback
624 // caching only for directories that are not shared. It also means that we need to clear the
625 // `O_APPEND` flag.
626 if writeback && flags & libc::O_APPEND != 0 {
627 flags &= !libc::O_APPEND;
628 }
629
630 self.open_fd(inode.as_raw_descriptor(), flags)
631 }
632
633 // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry634 fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int) -> Entry {
635 let altkey = InodeAltKey {
636 ino: st.st_ino,
637 dev: st.st_dev,
638 };
639 let data = self.inodes.lock().get_alt(&altkey).map(Arc::clone);
640
641 let inode = if let Some(data) = data {
642 // Matches with the release store in `forget`.
643 data.refcount.fetch_add(1, Ordering::Acquire);
644 data.inode
645 } else {
646 // There is a possible race here where 2 threads end up adding the same file
647 // into the inode list. However, since each of those will get a unique Inode
648 // value and unique file descriptors this shouldn't be that much of a problem.
649 let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
650 self.inodes.lock().insert(
651 inode,
652 InodeAltKey {
653 ino: st.st_ino,
654 dev: st.st_dev,
655 },
656 Arc::new(InodeData {
657 inode,
658 file: Mutex::new((f, open_flags)),
659 refcount: AtomicU64::new(1),
660 filetype: st.st_mode.into(),
661 }),
662 );
663
664 inode
665 };
666
667 Entry {
668 inode,
669 generation: 0,
670 attr: st,
671 attr_timeout: self.cfg.attr_timeout,
672 entry_timeout: self.cfg.entry_timeout,
673 }
674 }
675
676 // Performs an ascii case insensitive lookup.
ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry>677 fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
678 let mut buf = [0u8; 1024];
679 let mut offset = 0;
680 loop {
681 let mut read_dir = ReadDir::new(parent, offset, &mut buf[..])?;
682 if read_dir.remaining() == 0 {
683 break;
684 }
685
686 while let Some(entry) = read_dir.next() {
687 offset = entry.offset as libc::off64_t;
688 if name.eq_ignore_ascii_case(entry.name.to_bytes()) {
689 return self.do_lookup(parent, entry.name);
690 }
691 }
692 }
693 Err(io::Error::from_raw_os_error(libc::ENOENT))
694 }
695
do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry>696 fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
697 let st = statat(parent, name)?;
698
699 let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
700 match FileType::from(st.st_mode) {
701 FileType::Regular => {}
702 FileType::Directory => flags |= libc::O_DIRECTORY,
703 FileType::Other => flags |= libc::O_PATH,
704 }
705
706 // Safe because this doesn't modify any memory and we check the return value.
707 let fd = unsafe { libc::openat(parent.as_raw_descriptor(), name.as_ptr(), flags) };
708 if fd < 0 {
709 return Err(io::Error::last_os_error());
710 }
711
712 // Safe because we just opened this fd.
713 let f = unsafe { File::from_raw_descriptor(fd) };
714
715 Ok(self.add_entry(f, st, flags))
716 }
717
do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)>718 fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
719 let inode_data = self.find_inode(inode)?;
720
721 let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
722
723 let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
724 let data = HandleData { inode, file };
725
726 self.handles.lock().insert(handle, Arc::new(data));
727
728 let mut opts = OpenOptions::empty();
729 match self.cfg.cache_policy {
730 // We only set the direct I/O option on files.
731 CachePolicy::Never => opts.set(
732 OpenOptions::DIRECT_IO,
733 flags & (libc::O_DIRECTORY as u32) == 0,
734 ),
735 CachePolicy::Always => {
736 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
737 OpenOptions::KEEP_CACHE
738 } else {
739 OpenOptions::CACHE_DIR
740 }
741 }
742 _ => {}
743 };
744
745 Ok((Some(handle), opts))
746 }
747
do_release(&self, inode: Inode, handle: Handle) -> io::Result<()>748 fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
749 let mut handles = self.handles.lock();
750
751 if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
752 if e.get().inode == inode {
753 // We don't need to close the file here because that will happen automatically when
754 // the last `Arc` is dropped.
755 e.remove();
756 return Ok(());
757 }
758 }
759
760 Err(ebadf())
761 }
762
do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)>763 fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
764 let st = stat(inode)?;
765
766 Ok((st, self.cfg.attr_timeout))
767 }
768
do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()>769 fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
770 // Safe because this doesn't modify any memory and we check the return value.
771 let res = unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) };
772 if res == 0 {
773 Ok(())
774 } else {
775 Err(io::Error::last_os_error())
776 }
777 }
778
do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()>779 fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
780 // Safe because this doesn't modify any memory and we check the return value.
781 let res = unsafe {
782 if datasync {
783 libc::fdatasync(file.as_raw_descriptor())
784 } else {
785 libc::fsync(file.as_raw_descriptor())
786 }
787 };
788
789 if res == 0 {
790 Ok(())
791 } else {
792 Err(io::Error::last_os_error())
793 }
794 }
795
// Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
// directory. This effectively emulates an *at syscall starting at /proc, which is useful when
// there is no *at syscall available.
//
// Returns whatever `f` returns. `chdir_mutex` is held for the whole sequence so that only one
// thread at a time changes the process-wide CWD.
//
// Panics if there is no root inode. A failing fchdir call only panics in builds with debug
// assertions enabled, because the results are checked with `debug_assert_eq!`.
fn with_proc_chdir<F, T>(&self, f: F) -> T
where
    F: FnOnce() -> T,
{
    // The root inode is looked up before the lock is taken; it is needed afterwards to change
    // the CWD back.
    let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
    let chdir_lock = self.chdir_mutex.lock();

    // Safe because this doesn't modify any memory and we check the return value. Since the
    // fchdir should never fail we just use debug_asserts.
    let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
    debug_assert_eq!(
        proc_cwd,
        0,
        "failed to fchdir to /proc: {}",
        io::Error::last_os_error()
    );

    let res = f();

    // Safe because this doesn't modify any memory and we check the return value. Since the
    // fchdir should never fail we just use debug_asserts.
    let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
    debug_assert_eq!(
        root_cwd,
        0,
        "failed to fchdir back to root directory: {}",
        io::Error::last_os_error()
    );

    // Release the lock only after the CWD has been changed back.
    mem::drop(chdir_lock);
    res
}
832
do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize>833 fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
834 let res = if inode.filetype == FileType::Other {
835 // For non-regular files and directories, we cannot open the fd normally. Instead we
836 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
837 // and then setting the CWD back to the root directory.
838 let path = CString::new(format!("self/fd/{}", inode.as_raw_descriptor()))
839 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
840
841 // Safe because this will only modify `value` and we check the return value.
842 self.with_proc_chdir(|| unsafe {
843 libc::getxattr(
844 path.as_ptr(),
845 name.as_ptr(),
846 value.as_mut_ptr() as *mut libc::c_void,
847 value.len() as libc::size_t,
848 )
849 })
850 } else {
851 // For regular files and directories, we can just use fgetxattr. Safe because this will
852 // only write to `value` and we check the return value.
853 unsafe {
854 libc::fgetxattr(
855 inode.as_raw_descriptor(),
856 name.as_ptr(),
857 value.as_mut_ptr() as *mut libc::c_void,
858 value.len() as libc::size_t,
859 )
860 }
861 };
862
863 if res < 0 {
864 Err(io::Error::last_os_error())
865 } else {
866 Ok(res as usize)
867 }
868 }
869
// Handles the `FS_IOC_GET_ENCRYPTION_POLICY_EX` ioctl for `inode` / `handle`.
//
// The requested policy size is read from `r` and capped at the size of `fscrypt_policy`
// before the ioctl is issued. A failure to find the descriptor or to read the request is
// returned as an error of this function. A failure of the ioctl itself is reported inside
// the `IoctlReply`.
fn get_encryption_policy_ex<R: io::Read>(
    &self,
    inode: Inode,
    handle: Handle,
    mut r: R,
) -> io::Result<IoctlReply> {
    // When zero message open is in use the inode's own descriptor is used, otherwise the
    // descriptor of the handle.
    let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
        self.find_inode(inode)?
    } else {
        self.find_handle(handle, inode)?
    };

    // Safe because this only has integer fields.
    let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
    // Only the `policy_size` field is taken from the request.
    r.read_exact(arg.policy_size.as_mut_slice())?;

    // Never tell the kernel that more space is available than `arg.policy` actually has.
    let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
    arg.policy_size = policy_size;

    // Safe because the kernel will only write to `arg` and we check the return value.
    let res =
        unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
    if res < 0 {
        Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
    } else {
        // The reply consists of the `policy_size` field followed by `policy_size` bytes of the
        // policy.
        let len = size_of::<u64>() + arg.policy_size as usize;
        Ok(IoctlReply::Done(Ok(arg.as_slice()[..len].to_vec())))
    }
}
899
get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>900 fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
901 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
902 self.find_inode(inode)?
903 } else {
904 self.find_handle(handle, inode)?
905 };
906
907 let mut buf = MaybeUninit::<fsxattr>::zeroed();
908
909 // Safe because the kernel will only write to `buf` and we check the return value.
910 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
911 if res < 0 {
912 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
913 } else {
914 // Safe because the kernel guarantees that the policy is now initialized.
915 let xattr = unsafe { buf.assume_init() };
916 Ok(IoctlReply::Done(Ok(xattr.as_slice().to_vec())))
917 }
918 }
919
set_fsxattr<R: io::Read>( &self, inode: Inode, handle: Handle, r: R, ) -> io::Result<IoctlReply>920 fn set_fsxattr<R: io::Read>(
921 &self,
922 inode: Inode,
923 handle: Handle,
924 r: R,
925 ) -> io::Result<IoctlReply> {
926 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
927 self.find_inode(inode)?
928 } else {
929 self.find_handle(handle, inode)?
930 };
931
932 let attr = fsxattr::from_reader(r)?;
933
934 // Safe because this doesn't modify any memory and we check the return value.
935 let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &attr) };
936 if res < 0 {
937 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
938 } else {
939 Ok(IoctlReply::Done(Ok(Vec::new())))
940 }
941 }
942
get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>943 fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
944 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
945 self.find_inode(inode)?
946 } else {
947 self.find_handle(handle, inode)?
948 };
949
950 // The ioctl encoding is a long but the parameter is actually an int.
951 let mut flags: c_int = 0;
952
953 // Safe because the kernel will only write to `flags` and we check the return value.
954 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut flags) };
955 if res < 0 {
956 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
957 } else {
958 Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
959 }
960 }
961
set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply>962 fn set_flags<R: io::Read>(&self, inode: Inode, handle: Handle, r: R) -> io::Result<IoctlReply> {
963 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
964 self.find_inode(inode)?
965 } else {
966 self.find_handle(handle, inode)?
967 };
968
969 // The ioctl encoding is a long but the parameter is actually an int.
970 let flags = c_int::from_reader(r)?;
971
972 // Safe because this doesn't modify any memory and we check the return value.
973 let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &flags) };
974 if res < 0 {
975 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
976 } else {
977 Ok(IoctlReply::Done(Ok(Vec::new())))
978 }
979 }
980 }
981
forget_one( inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, inode: Inode, count: u64, )982 fn forget_one(
983 inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
984 inode: Inode,
985 count: u64,
986 ) {
987 if let Some(data) = inodes.get(&inode) {
988 // Acquiring the write lock on the inode map prevents new lookups from incrementing the
989 // refcount but there is the possibility that a previous lookup already acquired a
990 // reference to the inode data and is in the process of updating the refcount so we need
991 // to loop here until we can decrement successfully.
992 loop {
993 let refcount = data.refcount.load(Ordering::Relaxed);
994
995 // Saturating sub because it doesn't make sense for a refcount to go below zero and
996 // we don't want misbehaving clients to cause integer overflow.
997 let new_count = refcount.saturating_sub(count);
998
999 // Synchronizes with the acquire load in `do_lookup`.
1000 if data
1001 .refcount
1002 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
1003 .is_ok()
1004 {
1005 if new_count == 0 {
1006 // We just removed the last refcount for this inode. There's no need for an
1007 // acquire fence here because we hold a write lock on the inode map and any
1008 // thread that is waiting to do a forget on the same inode will have to wait
1009 // until we release the lock. So there's is no other release store for us to
1010 // synchronize with before deleting the entry.
1011 inodes.remove(&inode);
1012 }
1013 break;
1014 }
1015 }
1016 }
1017 }
1018
1019 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
1020 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
strip_xattr_prefix(buf: &mut Vec<u8>)1021 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
1022 fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
1023 if start >= b.len() {
1024 return None;
1025 }
1026
1027 let end = b[start..]
1028 .iter()
1029 .position(|&c| c == b'\0')
1030 .map(|p| start + p + 1)
1031 .unwrap_or(b.len());
1032
1033 Some(&b[start..end])
1034 }
1035
1036 let mut pos = 0;
1037 while let Some(name) = next_cstr(&buf, pos) {
1038 if !name.starts_with(USER_VIRTIOFS_XATTR) {
1039 pos += name.len();
1040 continue;
1041 }
1042
1043 let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
1044 buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
1045 pos += newlen;
1046 }
1047 }
1048
1049 impl FileSystem for PassthroughFs {
1050 type Inode = Inode;
1051 type Handle = Handle;
1052 type DirIter = ReadDir<Box<[u8]>>;
1053
init(&self, capable: FsOptions) -> io::Result<FsOptions>1054 fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
1055 // Safe because this is a constant value and a valid C string.
1056 let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
1057
1058 let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1059 // Safe because this doesn't modify any memory and we check the return value.
1060 let raw_descriptor = unsafe { libc::openat(libc::AT_FDCWD, root.as_ptr(), flags) };
1061 if raw_descriptor < 0 {
1062 return Err(io::Error::last_os_error());
1063 }
1064
1065 // Safe because we just opened this descriptor above.
1066 let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
1067
1068 let st = stat(&f)?;
1069
1070 // Safe because this doesn't modify any memory and there is no need to check the return
1071 // value because this system call always succeeds. We need to clear the umask here because
1072 // we want the client to be able to set all the bits in the mode.
1073 unsafe { libc::umask(0o000) };
1074
1075 let mut inodes = self.inodes.lock();
1076
1077 // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1078 inodes.insert(
1079 ROOT_ID,
1080 InodeAltKey {
1081 ino: st.st_ino,
1082 dev: st.st_dev,
1083 },
1084 Arc::new(InodeData {
1085 inode: ROOT_ID,
1086 file: Mutex::new((f, flags)),
1087 refcount: AtomicU64::new(2),
1088 filetype: st.st_mode.into(),
1089 }),
1090 );
1091
1092 let mut opts = FsOptions::DO_READDIRPLUS
1093 | FsOptions::READDIRPLUS_AUTO
1094 | FsOptions::EXPORT_SUPPORT
1095 | FsOptions::DONT_MASK
1096 | FsOptions::POSIX_ACL;
1097 if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
1098 opts |= FsOptions::WRITEBACK_CACHE;
1099 self.writeback.store(true, Ordering::Relaxed);
1100 }
1101 if self.cfg.cache_policy == CachePolicy::Always {
1102 if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
1103 opts |= FsOptions::ZERO_MESSAGE_OPEN;
1104 self.zero_message_open.store(true, Ordering::Relaxed);
1105 }
1106 if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
1107 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
1108 self.zero_message_opendir.store(true, Ordering::Relaxed);
1109 }
1110 }
1111 Ok(opts)
1112 }
1113
destroy(&self)1114 fn destroy(&self) {
1115 self.handles.lock().clear();
1116 self.inodes.lock().clear();
1117 }
1118
statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64>1119 fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
1120 let data = self.find_inode(inode)?;
1121
1122 let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1123
1124 // Safe because this will only modify `out` and we check the return value.
1125 let res = unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) };
1126 if res == 0 {
1127 // Safe because the kernel guarantees that `out` has been initialized.
1128 Ok(unsafe { out.assume_init() })
1129 } else {
1130 Err(io::Error::last_os_error())
1131 }
1132 }
1133
lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry>1134 fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
1135 let data = self.find_inode(parent)?;
1136 self.do_lookup(&data, name).or_else(|e| {
1137 if self.cfg.ascii_casefold {
1138 self.ascii_casefold_lookup(&data, name.to_bytes())
1139 } else {
1140 Err(e)
1141 }
1142 })
1143 }
1144
forget(&self, _ctx: Context, inode: Inode, count: u64)1145 fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
1146 let mut inodes = self.inodes.lock();
1147
1148 forget_one(&mut inodes, inode, count)
1149 }
1150
batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>)1151 fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
1152 let mut inodes = self.inodes.lock();
1153
1154 for (inode, count) in requests {
1155 forget_one(&mut inodes, inode, count)
1156 }
1157 }
1158
opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1159 fn opendir(
1160 &self,
1161 _ctx: Context,
1162 inode: Inode,
1163 flags: u32,
1164 ) -> io::Result<(Option<Handle>, OpenOptions)> {
1165 if self.zero_message_opendir.load(Ordering::Relaxed) {
1166 Err(io::Error::from_raw_os_error(libc::ENOSYS))
1167 } else {
1168 self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
1169 }
1170 }
1171
releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()>1172 fn releasedir(
1173 &self,
1174 _ctx: Context,
1175 inode: Inode,
1176 _flags: u32,
1177 handle: Handle,
1178 ) -> io::Result<()> {
1179 if self.zero_message_opendir.load(Ordering::Relaxed) {
1180 Ok(())
1181 } else {
1182 self.do_release(inode, handle)
1183 }
1184 }
1185
mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, ) -> io::Result<Entry>1186 fn mkdir(
1187 &self,
1188 ctx: Context,
1189 parent: Inode,
1190 name: &CStr,
1191 mode: u32,
1192 umask: u32,
1193 ) -> io::Result<Entry> {
1194 let data = self.find_inode(parent)?;
1195
1196 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1197 let res = {
1198 let mut um = self.umask.lock();
1199 let _scoped_umask = um.set(umask);
1200
1201 // Safe because this doesn't modify any memory and we check the return value.
1202 unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) }
1203 };
1204 if res == 0 {
1205 self.do_lookup(&data, name)
1206 } else {
1207 Err(io::Error::last_os_error())
1208 }
1209 }
1210
rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1211 fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1212 let data = self.find_inode(parent)?;
1213 self.do_unlink(&data, name, libc::AT_REMOVEDIR)
1214 }
1215
readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result<Self::DirIter>1216 fn readdir(
1217 &self,
1218 _ctx: Context,
1219 inode: Inode,
1220 handle: Handle,
1221 size: u32,
1222 offset: u64,
1223 ) -> io::Result<Self::DirIter> {
1224 let buf = vec![0; size as usize].into_boxed_slice();
1225
1226 if self.zero_message_opendir.load(Ordering::Relaxed) {
1227 let data = self.find_inode(inode)?;
1228 ReadDir::new(&*data, offset as libc::off64_t, buf)
1229 } else {
1230 let data = self.find_handle(handle, inode)?;
1231
1232 let dir = data.file.lock();
1233
1234 ReadDir::new(&*dir, offset as libc::off64_t, buf)
1235 }
1236 }
1237
open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1238 fn open(
1239 &self,
1240 _ctx: Context,
1241 inode: Inode,
1242 flags: u32,
1243 ) -> io::Result<(Option<Handle>, OpenOptions)> {
1244 if self.zero_message_open.load(Ordering::Relaxed) {
1245 Err(io::Error::from_raw_os_error(libc::ENOSYS))
1246 } else {
1247 self.do_open(inode, flags)
1248 }
1249 }
1250
release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option<u64>, ) -> io::Result<()>1251 fn release(
1252 &self,
1253 _ctx: Context,
1254 inode: Inode,
1255 _flags: u32,
1256 handle: Handle,
1257 _flush: bool,
1258 _flock_release: bool,
1259 _lock_owner: Option<u64>,
1260 ) -> io::Result<()> {
1261 if self.zero_message_open.load(Ordering::Relaxed) {
1262 Ok(())
1263 } else {
1264 self.do_release(inode, handle)
1265 }
1266 }
1267
chromeos_tmpfile( &self, ctx: Context, parent: Self::Inode, mode: u32, umask: u32, ) -> io::Result<Entry>1268 fn chromeos_tmpfile(
1269 &self,
1270 ctx: Context,
1271 parent: Self::Inode,
1272 mode: u32,
1273 umask: u32,
1274 ) -> io::Result<Entry> {
1275 let data = self.find_inode(parent)?;
1276
1277 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1278
1279 let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
1280
1281 // Safe because this is a valid c string.
1282 let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
1283
1284 let fd = {
1285 let mut um = self.umask.lock();
1286 let _scoped_umask = um.set(umask);
1287
1288 // Safe because this doesn't modify any memory and we check the return value.
1289 unsafe {
1290 libc::openat(
1291 data.as_raw_descriptor(),
1292 current_dir.as_ptr(),
1293 tmpflags,
1294 mode,
1295 )
1296 }
1297 };
1298 if fd < 0 {
1299 return Err(io::Error::last_os_error());
1300 }
1301
1302 // Safe because we just opened this fd.
1303 let tmpfile = unsafe { File::from_raw_descriptor(fd) };
1304
1305 let st = stat(&tmpfile)?;
1306 Ok(self.add_entry(tmpfile, st, tmpflags))
1307 }
1308
create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>1309 fn create(
1310 &self,
1311 ctx: Context,
1312 parent: Inode,
1313 name: &CStr,
1314 mode: u32,
1315 flags: u32,
1316 umask: u32,
1317 ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
1318 let data = self.find_inode(parent)?;
1319
1320 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1321
1322 let create_flags =
1323 (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
1324
1325 let fd = {
1326 let mut um = self.umask.lock();
1327 let _scoped_umask = um.set(umask);
1328
1329 // Safe because this doesn't modify any memory and we check the return value. We don't
1330 // really check `flags` because if the kernel can't handle poorly specified flags then
1331 // we have much bigger problems.
1332 unsafe { libc::openat(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode) }
1333 };
1334 if fd < 0 {
1335 return Err(io::Error::last_os_error());
1336 }
1337
1338 // Safe because we just opened this fd.
1339 let file = unsafe { File::from_raw_descriptor(fd) };
1340
1341 let st = stat(&file)?;
1342 let entry = self.add_entry(file, st, create_flags);
1343
1344 let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
1345 (None, OpenOptions::KEEP_CACHE)
1346 } else {
1347 self.do_open(
1348 entry.inode,
1349 flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
1350 )
1351 .map_err(|e| {
1352 // Don't leak the entry.
1353 self.forget(ctx, entry.inode, 1);
1354 e
1355 })?
1356 };
1357
1358 Ok((entry, handle, opts))
1359 }
1360
unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>1361 fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1362 let data = self.find_inode(parent)?;
1363 self.do_unlink(&data, name, 0)
1364 }
1365
read<W: io::Write + ZeroCopyWriter>( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option<u64>, _flags: u32, ) -> io::Result<usize>1366 fn read<W: io::Write + ZeroCopyWriter>(
1367 &self,
1368 _ctx: Context,
1369 inode: Inode,
1370 handle: Handle,
1371 mut w: W,
1372 size: u32,
1373 offset: u64,
1374 _lock_owner: Option<u64>,
1375 _flags: u32,
1376 ) -> io::Result<usize> {
1377 if self.zero_message_open.load(Ordering::Relaxed) {
1378 let data = self.find_inode(inode)?;
1379
1380 let mut file = data.file.lock();
1381 let mut flags = file.1;
1382 match flags & libc::O_ACCMODE {
1383 libc::O_WRONLY => {
1384 flags &= !libc::O_WRONLY;
1385 flags |= libc::O_RDWR;
1386
1387 // We need to get a readable handle for this file.
1388 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1389 *file = (newfile, flags);
1390 }
1391 libc::O_RDONLY | libc::O_RDWR => {}
1392 _ => panic!("Unexpected flags: {:#x}", flags),
1393 }
1394
1395 w.write_from(&mut file.0, size as usize, offset)
1396 } else {
1397 let data = self.find_handle(handle, inode)?;
1398
1399 let mut f = data.file.lock();
1400 w.write_from(&mut f, size as usize, offset)
1401 }
1402 }
1403
write<R: io::Read + ZeroCopyReader>( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option<u64>, _delayed_write: bool, flags: u32, ) -> io::Result<usize>1404 fn write<R: io::Read + ZeroCopyReader>(
1405 &self,
1406 _ctx: Context,
1407 inode: Inode,
1408 handle: Handle,
1409 mut r: R,
1410 size: u32,
1411 offset: u64,
1412 _lock_owner: Option<u64>,
1413 _delayed_write: bool,
1414 flags: u32,
1415 ) -> io::Result<usize> {
1416 // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
1417 // automatically clear the setuid and setgid bits for us.
1418 let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
1419 Some(drop_cap_fsetid()?)
1420 } else {
1421 None
1422 };
1423
1424 if self.zero_message_open.load(Ordering::Relaxed) {
1425 let data = self.find_inode(inode)?;
1426
1427 let mut file = data.file.lock();
1428 let mut flags = file.1;
1429 match flags & libc::O_ACCMODE {
1430 libc::O_RDONLY => {
1431 flags &= !libc::O_RDONLY;
1432 flags |= libc::O_RDWR;
1433
1434 // We need to get a writable handle for this file.
1435 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
1436 *file = (newfile, flags);
1437 }
1438 libc::O_WRONLY | libc::O_RDWR => {}
1439 _ => panic!("Unexpected flags: {:#x}", flags),
1440 }
1441
1442 r.read_to(&mut file.0, size as usize, offset)
1443 } else {
1444 let data = self.find_handle(handle, inode)?;
1445
1446 let mut f = data.file.lock();
1447 r.read_to(&mut f, size as usize, offset)
1448 }
1449 }
1450
getattr( &self, _ctx: Context, inode: Inode, _handle: Option<Handle>, ) -> io::Result<(libc::stat64, Duration)>1451 fn getattr(
1452 &self,
1453 _ctx: Context,
1454 inode: Inode,
1455 _handle: Option<Handle>,
1456 ) -> io::Result<(libc::stat64, Duration)> {
1457 let data = self.find_inode(inode)?;
1458 self.do_getattr(&data)
1459 }
1460
// Applies the attribute changes selected by `valid` to `inode`, taking the new values from
// `attr`, and then returns the refreshed attributes from `do_getattr`.
//
// The changes are applied in a fixed order: mode, owner/group, size, then access/modification
// times. The first failing system call aborts the sequence and its OS error is returned;
// changes applied before that point are not rolled back.
//
// A non-zero `handle` is operated on through its descriptor. Otherwise most operations go
// through the path `self/fd/<fd>` resolved relative to `self.proc`.
fn setattr(
    &self,
    _ctx: Context,
    inode: Inode,
    attr: libc::stat64,
    handle: Option<Handle>,
    valid: SetattrValid,
) -> io::Result<(libc::stat64, Duration)> {
    let inode_data = self.find_inode(inode)?;

    // How the target file is addressed by the system calls below.
    enum Data {
        // An open handle together with its raw descriptor. The `Arc` is stored next to the
        // descriptor, presumably to keep the handle alive while the descriptor is in use.
        Handle(Arc<HandleData>, RawDescriptor),
        // A `self/fd/<fd>` path naming the inode's descriptor.
        ProcPath(CString),
    }

    // If we have a (non-zero) handle then use it, otherwise address the inode's descriptor
    // through a `self/fd/<fd>` path.
    let data = if let Some(handle) = handle.filter(|&h| h != 0) {
        let hd = self.find_handle(handle, inode)?;

        let fd = hd.file.lock().as_raw_descriptor();
        Data::Handle(hd, fd)
    } else {
        let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        Data::ProcPath(pathname)
    };

    if valid.contains(SetattrValid::MODE) {
        // Safe because this doesn't modify any memory and we check the return value.
        let res = unsafe {
            match data {
                Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
                Data::ProcPath(ref p) => {
                    libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
                }
            }
        };
        if res < 0 {
            return Err(io::Error::last_os_error());
        }
    }

    if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
        // An id that is not being changed is passed as `u32::MAX`.
        let uid = if valid.contains(SetattrValid::UID) {
            attr.st_uid
        } else {
            // Cannot use -1 here because these are unsigned values.
            ::std::u32::MAX
        };
        let gid = if valid.contains(SetattrValid::GID) {
            attr.st_gid
        } else {
            // Cannot use -1 here because these are unsigned values.
            ::std::u32::MAX
        };

        // Safe because this is a constant value and a valid C string.
        let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };

        // Ownership is always changed through the inode's descriptor with an empty path,
        // regardless of whether a handle was supplied.
        // Safe because this doesn't modify any memory and we check the return value.
        let res = unsafe {
            libc::fchownat(
                inode_data.as_raw_descriptor(),
                empty.as_ptr(),
                uid,
                gid,
                libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
            )
        };
        if res < 0 {
            return Err(io::Error::last_os_error());
        }
    }

    if valid.contains(SetattrValid::SIZE) {
        // Safe because this doesn't modify any memory and we check the return value.
        let res = match data {
            Data::Handle(_, fd) => unsafe { libc::ftruncate64(fd, attr.st_size) },
            _ => {
                // There is no `ftruncateat` so we need to get a new fd and truncate it.
                let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
                unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
            }
        };
        if res < 0 {
            return Err(io::Error::last_os_error());
        }
    }

    if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
        // Index 0 is the access time and index 1 the modification time. Both start out as
        // `UTIME_OMIT` and are only overwritten for the timestamps selected in `valid`.
        let mut tvs = [
            libc::timespec {
                tv_sec: 0,
                tv_nsec: libc::UTIME_OMIT,
            },
            libc::timespec {
                tv_sec: 0,
                tv_nsec: libc::UTIME_OMIT,
            },
        ];

        // The `*_NOW` flags take precedence over the explicit timestamps in `attr`.
        if valid.contains(SetattrValid::ATIME_NOW) {
            tvs[0].tv_nsec = libc::UTIME_NOW;
        } else if valid.contains(SetattrValid::ATIME) {
            tvs[0].tv_sec = attr.st_atime;
            tvs[0].tv_nsec = attr.st_atime_nsec;
        }

        if valid.contains(SetattrValid::MTIME_NOW) {
            tvs[1].tv_nsec = libc::UTIME_NOW;
        } else if valid.contains(SetattrValid::MTIME) {
            tvs[1].tv_sec = attr.st_mtime;
            tvs[1].tv_nsec = attr.st_mtime_nsec;
        }

        // Safe because this doesn't modify any memory and we check the return value.
        let res = match data {
            Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) },
            Data::ProcPath(ref p) => unsafe {
                libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
            },
        };
        if res < 0 {
            return Err(io::Error::last_os_error());
        }
    }

    self.do_getattr(&inode_data)
}
1590
rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()>1591 fn rename(
1592 &self,
1593 _ctx: Context,
1594 olddir: Inode,
1595 oldname: &CStr,
1596 newdir: Inode,
1597 newname: &CStr,
1598 flags: u32,
1599 ) -> io::Result<()> {
1600 let old_inode = self.find_inode(olddir)?;
1601 let new_inode = self.find_inode(newdir)?;
1602
1603 // Safe because this doesn't modify any memory and we check the return value.
1604 // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
1605 // and we have glibc 2.28.
1606 let res = unsafe {
1607 libc::syscall(
1608 libc::SYS_renameat2,
1609 old_inode.as_raw_descriptor(),
1610 oldname.as_ptr(),
1611 new_inode.as_raw_descriptor(),
1612 newname.as_ptr(),
1613 flags,
1614 )
1615 };
1616 if res == 0 {
1617 Ok(())
1618 } else {
1619 Err(io::Error::last_os_error())
1620 }
1621 }
1622
mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, ) -> io::Result<Entry>1623 fn mknod(
1624 &self,
1625 ctx: Context,
1626 parent: Inode,
1627 name: &CStr,
1628 mode: u32,
1629 rdev: u32,
1630 umask: u32,
1631 ) -> io::Result<Entry> {
1632 let data = self.find_inode(parent)?;
1633
1634 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1635
1636 let res = {
1637 let mut um = self.umask.lock();
1638 let _scoped_umask = um.set(umask);
1639
1640 // Safe because this doesn't modify any memory and we check the return value.
1641 unsafe {
1642 libc::mknodat(
1643 data.as_raw_descriptor(),
1644 name.as_ptr(),
1645 mode as libc::mode_t,
1646 rdev as libc::dev_t,
1647 )
1648 }
1649 };
1650
1651 if res < 0 {
1652 Err(io::Error::last_os_error())
1653 } else {
1654 self.do_lookup(&data, name)
1655 }
1656 }
1657
link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result<Entry>1658 fn link(
1659 &self,
1660 _ctx: Context,
1661 inode: Inode,
1662 newparent: Inode,
1663 newname: &CStr,
1664 ) -> io::Result<Entry> {
1665 let data = self.find_inode(inode)?;
1666 let new_inode = self.find_inode(newparent)?;
1667
1668 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1669 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1670
1671 // Safe because this doesn't modify any memory and we check the return value.
1672 let res = unsafe {
1673 libc::linkat(
1674 self.proc.as_raw_descriptor(),
1675 path.as_ptr(),
1676 new_inode.as_raw_descriptor(),
1677 newname.as_ptr(),
1678 libc::AT_SYMLINK_FOLLOW,
1679 )
1680 };
1681 if res == 0 {
1682 self.do_lookup(&new_inode, newname)
1683 } else {
1684 Err(io::Error::last_os_error())
1685 }
1686 }
1687
symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, ) -> io::Result<Entry>1688 fn symlink(
1689 &self,
1690 ctx: Context,
1691 linkname: &CStr,
1692 parent: Inode,
1693 name: &CStr,
1694 ) -> io::Result<Entry> {
1695 let data = self.find_inode(parent)?;
1696
1697 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1698
1699 // Safe because this doesn't modify any memory and we check the return value.
1700 let res =
1701 unsafe { libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr()) };
1702 if res == 0 {
1703 self.do_lookup(&data, name)
1704 } else {
1705 Err(io::Error::last_os_error())
1706 }
1707 }
1708
readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>>1709 fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
1710 let data = self.find_inode(inode)?;
1711
1712 let mut buf = vec![0; libc::PATH_MAX as usize];
1713
1714 // Safe because this is a constant value and a valid C string.
1715 let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1716
1717 // Safe because this will only modify the contents of `buf` and we check the return value.
1718 let res = unsafe {
1719 libc::readlinkat(
1720 data.as_raw_descriptor(),
1721 empty.as_ptr(),
1722 buf.as_mut_ptr() as *mut libc::c_char,
1723 buf.len(),
1724 )
1725 };
1726 if res < 0 {
1727 return Err(io::Error::last_os_error());
1728 }
1729
1730 buf.resize(res as usize, 0);
1731 Ok(buf)
1732 }
1733
flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()>1734 fn flush(
1735 &self,
1736 _ctx: Context,
1737 inode: Inode,
1738 handle: Handle,
1739 _lock_owner: u64,
1740 ) -> io::Result<()> {
1741 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1742 self.find_inode(inode)?
1743 } else {
1744 self.find_handle(handle, inode)?
1745 };
1746
1747 // Since this method is called whenever an fd is closed in the client, we can emulate that
1748 // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
1749 // because this doesn't modify any memory and we check the return values.
1750 unsafe {
1751 let newfd = libc::fcntl(data.as_raw_descriptor(), libc::F_DUPFD_CLOEXEC, 0);
1752
1753 if newfd < 0 {
1754 return Err(io::Error::last_os_error());
1755 }
1756
1757 if libc::close(newfd) < 0 {
1758 Err(io::Error::last_os_error())
1759 } else {
1760 Ok(())
1761 }
1762 }
1763 }
1764
fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()>1765 fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
1766 if self.zero_message_open.load(Ordering::Relaxed) {
1767 let data = self.find_inode(inode)?;
1768 self.do_fsync(&*data, datasync)
1769 } else {
1770 let data = self.find_handle(handle, inode)?;
1771
1772 let file = data.file.lock();
1773 self.do_fsync(&*file, datasync)
1774 }
1775 }
1776
fsyncdir( &self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()>1777 fn fsyncdir(
1778 &self,
1779 _ctx: Context,
1780 inode: Inode,
1781 datasync: bool,
1782 handle: Handle,
1783 ) -> io::Result<()> {
1784 if self.zero_message_opendir.load(Ordering::Relaxed) {
1785 let data = self.find_inode(inode)?;
1786 self.do_fsync(&*data, datasync)
1787 } else {
1788 let data = self.find_handle(handle, inode)?;
1789
1790 let file = data.file.lock();
1791 self.do_fsync(&*file, datasync)
1792 }
1793 }
1794
access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()>1795 fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
1796 let data = self.find_inode(inode)?;
1797
1798 let st = stat(&*data)?;
1799 let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
1800
1801 if mode == libc::F_OK {
1802 // The file exists since we were able to call `stat(2)` on it.
1803 return Ok(());
1804 }
1805
1806 if (mode & libc::R_OK) != 0 {
1807 if ctx.uid != 0
1808 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
1809 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
1810 && st.st_mode & 0o004 == 0
1811 {
1812 return Err(io::Error::from_raw_os_error(libc::EACCES));
1813 }
1814 }
1815
1816 if (mode & libc::W_OK) != 0 {
1817 if ctx.uid != 0
1818 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
1819 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
1820 && st.st_mode & 0o002 == 0
1821 {
1822 return Err(io::Error::from_raw_os_error(libc::EACCES));
1823 }
1824 }
1825
1826 // root can only execute something if it is executable by one of the owner, the group, or
1827 // everyone.
1828 if (mode & libc::X_OK) != 0 {
1829 if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
1830 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
1831 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
1832 && st.st_mode & 0o001 == 0
1833 {
1834 return Err(io::Error::from_raw_os_error(libc::EACCES));
1835 }
1836 }
1837
1838 Ok(())
1839 }
1840
setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()>1841 fn setxattr(
1842 &self,
1843 _ctx: Context,
1844 inode: Inode,
1845 name: &CStr,
1846 value: &[u8],
1847 flags: u32,
1848 ) -> io::Result<()> {
1849 // We can't allow the VM to set this xattr because an unprivileged process may use it to set
1850 // a privileged xattr.
1851 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
1852 return Err(io::Error::from_raw_os_error(libc::EPERM));
1853 }
1854
1855 let data = self.find_inode(inode)?;
1856 let name = self.rewrite_xattr_name(name);
1857
1858 let res = if data.filetype == FileType::Other {
1859 // For non-regular files and directories, we cannot open the fd normally. Instead we
1860 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1861 // and then setting the CWD back to the root directory.
1862 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1863 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1864
1865 // Safe because this doesn't modify any memory and we check the return value.
1866 self.with_proc_chdir(|| unsafe {
1867 libc::setxattr(
1868 path.as_ptr(),
1869 name.as_ptr(),
1870 value.as_ptr() as *const libc::c_void,
1871 value.len() as libc::size_t,
1872 flags as c_int,
1873 )
1874 })
1875 } else {
1876 // For regular files and directories, we can just use fsetxattr. Safe because this
1877 // doesn't modify any memory and we check the return value.
1878 unsafe {
1879 libc::fsetxattr(
1880 data.as_raw_descriptor(),
1881 name.as_ptr(),
1882 value.as_ptr() as *const libc::c_void,
1883 value.len() as libc::size_t,
1884 flags as c_int,
1885 )
1886 }
1887 };
1888
1889 if res < 0 {
1890 Err(io::Error::last_os_error())
1891 } else {
1892 Ok(())
1893 }
1894 }
1895
getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result<GetxattrReply>1896 fn getxattr(
1897 &self,
1898 _ctx: Context,
1899 inode: Inode,
1900 name: &CStr,
1901 size: u32,
1902 ) -> io::Result<GetxattrReply> {
1903 // We don't allow the VM to set this xattr so we also pretend there is no value associated
1904 // with it.
1905 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
1906 return Err(io::Error::from_raw_os_error(libc::ENODATA));
1907 }
1908
1909 let data = self.find_inode(inode)?;
1910 let name = self.rewrite_xattr_name(name);
1911 let mut buf = vec![0u8; size as usize];
1912
1913 // Safe because this will only modify the contents of `buf`.
1914 let res = self.do_getxattr(&data, &name, &mut buf[..])?;
1915 if size == 0 {
1916 Ok(GetxattrReply::Count(res as u32))
1917 } else {
1918 buf.truncate(res as usize);
1919 Ok(GetxattrReply::Value(buf))
1920 }
1921 }
1922
listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply>1923 fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
1924 let data = self.find_inode(inode)?;
1925
1926 let mut buf = vec![0u8; size as usize];
1927
1928 let res = if data.filetype == FileType::Other {
1929 // For non-regular files and directories, we cannot open the fd normally. Instead we
1930 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1931 // and then setting the CWD back to the root directory.
1932 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1933 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1934
1935 // Safe because this will only modify `buf` and we check the return value.
1936 self.with_proc_chdir(|| unsafe {
1937 libc::listxattr(
1938 path.as_ptr(),
1939 buf.as_mut_ptr() as *mut libc::c_char,
1940 buf.len() as libc::size_t,
1941 )
1942 })
1943 } else {
1944 // For regular files and directories, we can just flistxattr. Safe because this will only
1945 // write to `buf` and we check the return value.
1946 unsafe {
1947 libc::flistxattr(
1948 data.as_raw_descriptor(),
1949 buf.as_mut_ptr() as *mut libc::c_char,
1950 buf.len() as libc::size_t,
1951 )
1952 }
1953 };
1954
1955 if res < 0 {
1956 return Err(io::Error::last_os_error());
1957 }
1958
1959 if size == 0 {
1960 Ok(ListxattrReply::Count(res as u32))
1961 } else {
1962 buf.truncate(res as usize);
1963
1964 if self.cfg.rewrite_security_xattrs {
1965 strip_xattr_prefix(&mut buf);
1966 }
1967 Ok(ListxattrReply::Names(buf))
1968 }
1969 }
1970
removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()>1971 fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
1972 // We don't allow the VM to set this xattr so we also pretend there is no value associated
1973 // with it.
1974 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
1975 return Err(io::Error::from_raw_os_error(libc::ENODATA));
1976 }
1977
1978 let data = self.find_inode(inode)?;
1979 let name = self.rewrite_xattr_name(name);
1980
1981 let res = if data.filetype == FileType::Other {
1982 // For non-regular files and directories, we cannot open the fd normally. Instead we
1983 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1984 // and then setting the CWD back to the root directory.
1985 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
1986 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1987
1988 // Safe because this doesn't modify any memory and we check the return value.
1989 self.with_proc_chdir(|| unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) })
1990 } else {
1991 // For regular files and directories, we can just use fremovexattr. Safe because this
1992 // doesn't modify any memory and we check the return value.
1993 unsafe { libc::fremovexattr(data.as_raw_descriptor(), name.as_ptr()) }
1994 };
1995
1996 if res == 0 {
1997 Ok(())
1998 } else {
1999 Err(io::Error::last_os_error())
2000 }
2001 }
2002
fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()>2003 fn fallocate(
2004 &self,
2005 _ctx: Context,
2006 inode: Inode,
2007 handle: Handle,
2008 mode: u32,
2009 offset: u64,
2010 length: u64,
2011 ) -> io::Result<()> {
2012 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2013 let data = self.find_inode(inode)?;
2014
2015 {
2016 // fallocate needs a writable fd
2017 let mut file = data.file.lock();
2018 let mut flags = file.1;
2019 match flags & libc::O_ACCMODE {
2020 libc::O_RDONLY => {
2021 flags &= !libc::O_RDONLY;
2022 flags |= libc::O_RDWR;
2023
2024 // We need to get a writable handle for this file.
2025 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2026 *file = (newfile, flags);
2027 }
2028 libc::O_WRONLY | libc::O_RDWR => {}
2029 _ => panic!("Unexpected flags: {:#x}", flags),
2030 }
2031 }
2032
2033 data
2034 } else {
2035 self.find_handle(handle, inode)?
2036 };
2037
2038 let fd = data.as_raw_descriptor();
2039 // Safe because this doesn't modify any memory and we check the return value.
2040 let res = unsafe {
2041 libc::fallocate64(
2042 fd,
2043 mode as libc::c_int,
2044 offset as libc::off64_t,
2045 length as libc::off64_t,
2046 )
2047 };
2048 if res == 0 {
2049 Ok(())
2050 } else {
2051 Err(io::Error::last_os_error())
2052 }
2053 }
2054
ioctl<R: io::Read>( &self, _ctx: Context, inode: Inode, handle: Handle, _flags: IoctlFlags, cmd: u32, _arg: u64, in_size: u32, out_size: u32, r: R, ) -> io::Result<IoctlReply>2055 fn ioctl<R: io::Read>(
2056 &self,
2057 _ctx: Context,
2058 inode: Inode,
2059 handle: Handle,
2060 _flags: IoctlFlags,
2061 cmd: u32,
2062 _arg: u64,
2063 in_size: u32,
2064 out_size: u32,
2065 r: R,
2066 ) -> io::Result<IoctlReply> {
2067 const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
2068 const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
2069 const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
2070 const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
2071 const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
2072 const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
2073 const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
2074
2075 match cmd {
2076 GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
2077 GET_FSXATTR => {
2078 if out_size < size_of::<fsxattr>() as u32 {
2079 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2080 } else {
2081 self.get_fsxattr(inode, handle)
2082 }
2083 }
2084 SET_FSXATTR => {
2085 if in_size < size_of::<fsxattr>() as u32 {
2086 Err(io::Error::from_raw_os_error(libc::EINVAL))
2087 } else {
2088 self.set_fsxattr(inode, handle, r)
2089 }
2090 }
2091 GET_FLAGS32 | GET_FLAGS64 => {
2092 if out_size < size_of::<c_int>() as u32 {
2093 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2094 } else {
2095 self.get_flags(inode, handle)
2096 }
2097 }
2098 SET_FLAGS32 | SET_FLAGS64 => {
2099 if in_size < size_of::<c_int>() as u32 {
2100 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2101 } else {
2102 self.set_flags(inode, handle, r)
2103 }
2104 }
2105 _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
2106 }
2107 }
2108
copy_file_range( &self, ctx: Context, inode_src: Inode, handle_src: Handle, offset_src: u64, inode_dst: Inode, handle_dst: Handle, offset_dst: u64, length: u64, flags: u64, ) -> io::Result<usize>2109 fn copy_file_range(
2110 &self,
2111 ctx: Context,
2112 inode_src: Inode,
2113 handle_src: Handle,
2114 offset_src: u64,
2115 inode_dst: Inode,
2116 handle_dst: Handle,
2117 offset_dst: u64,
2118 length: u64,
2119 flags: u64,
2120 ) -> io::Result<usize> {
2121 // We need to change credentials during a write so that the kernel will remove setuid or
2122 // setgid bits from the file if it was written to by someone other than the owner.
2123 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2124 let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
2125 if self.zero_message_open.load(Ordering::Relaxed) {
2126 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
2127 } else {
2128 (
2129 self.find_handle(handle_src, inode_src)?,
2130 self.find_handle(handle_dst, inode_dst)?,
2131 )
2132 };
2133
2134 let src = src_data.as_raw_descriptor();
2135 let dst = dst_data.as_raw_descriptor();
2136
2137 let res = unsafe {
2138 libc::syscall(
2139 libc::SYS_copy_file_range,
2140 src,
2141 &offset_src,
2142 dst,
2143 &offset_dst,
2144 length,
2145 flags,
2146 )
2147 };
2148
2149 if res >= 0 {
2150 Ok(res as usize)
2151 } else {
2152 Err(io::Error::last_os_error())
2153 }
2154 }
2155
set_up_mapping<M: Mapper>( &self, _ctx: Context, inode: Self::Inode, _handle: Self::Handle, file_offset: u64, mem_offset: u64, size: usize, prot: u32, mapper: M, ) -> io::Result<()>2156 fn set_up_mapping<M: Mapper>(
2157 &self,
2158 _ctx: Context,
2159 inode: Self::Inode,
2160 _handle: Self::Handle,
2161 file_offset: u64,
2162 mem_offset: u64,
2163 size: usize,
2164 prot: u32,
2165 mapper: M,
2166 ) -> io::Result<()> {
2167 let read = prot & libc::PROT_READ as u32 != 0;
2168 let write = prot & libc::PROT_WRITE as u32 != 0;
2169 let mmap_flags = match (read, write) {
2170 (true, true) => libc::O_RDWR,
2171 (true, false) => libc::O_RDONLY,
2172 (false, true) => libc::O_RDWR, // mmap always requires an fd opened for reading.
2173 (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
2174 };
2175
2176 let data = self.find_inode(inode)?;
2177
2178 if self.zero_message_open.load(Ordering::Relaxed) {
2179 let mut file = data.file.lock();
2180 let mut open_flags = file.1;
2181 match (mmap_flags, open_flags & libc::O_ACCMODE) {
2182 (libc::O_RDONLY, libc::O_WRONLY)
2183 | (libc::O_RDWR, libc::O_RDONLY)
2184 | (libc::O_RDWR, libc::O_WRONLY) => {
2185 // We have a read-only or write-only fd and we need to upgrade it.
2186 open_flags &= !libc::O_ACCMODE;
2187 open_flags |= libc::O_RDWR;
2188
2189 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2190 *file = (newfile, open_flags);
2191 }
2192 (libc::O_RDONLY, libc::O_RDONLY)
2193 | (libc::O_RDONLY, libc::O_RDWR)
2194 | (libc::O_RDWR, libc::O_RDWR) => {}
2195 (m, o) => panic!(
2196 "Unexpected combination of access flags: ({:#x}, {:#x})",
2197 m, o
2198 ),
2199 }
2200 mapper.map(mem_offset, size, &file.0, file_offset, prot)
2201 } else {
2202 let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
2203 mapper.map(mem_offset, size, &file, file_offset, prot)
2204 }
2205 }
2206
remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()>2207 fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
2208 for RemoveMappingOne { moffset, len } in msgs {
2209 mapper.unmap(*moffset, *len)?;
2210 }
2211 Ok(())
2212 }
2213 }
2214
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps a nul-terminated byte literal in a `CStr`, panicking if the literal is malformed.
    fn cstr(bytes: &[u8]) -> &CStr {
        CStr::from_bytes_with_nul(bytes).expect("test literal must be a valid C string")
    }

    /// Runs `strip_xattr_prefix` over a copy of `input` and returns the resulting bytes.
    fn stripped(input: &[u8]) -> Vec<u8> {
        let mut buf = input.to_vec();
        strip_xattr_prefix(&mut buf);
        buf
    }

    #[test]
    fn rewrite_xattr_names() {
        let cfg = Config {
            rewrite_security_xattrs: true,
            ..Default::default()
        };

        let p = PassthroughFs::new(cfg).expect("Failed to create PassthroughFs");

        // Selinux shouldn't get overwritten; user, trusted, and system should not be changed
        // either.
        let unchanged: [&[u8]; 4] = [
            b"security.selinux\0",
            b"user.foobar\0",
            b"trusted.foobar\0",
            b"system.foobar\0",
        ];
        for &raw in unchanged.iter() {
            let name = cstr(raw);
            assert_eq!(p.rewrite_xattr_name(name).to_bytes(), name.to_bytes());
        }

        // sehash should be re-written.
        let sehash = cstr(b"security.sehash\0");
        assert_eq!(
            p.rewrite_xattr_name(sehash).to_bytes(),
            b"user.virtiofs.security.sehash"
        );
    }

    #[test]
    fn strip_xattr_names() {
        // (description, input, expected output)
        let cases: &[(&str, &[u8], &[u8])] = &[
            ("only nuls", b"\0\0\0\0\0", b"\0\0\0\0\0"),
            (
                "no nuls",
                b"security.sehashuser.virtiofs",
                b"security.sehashuser.virtiofs",
            ),
            ("empty", b"", b""),
            (
                "no strippable names",
                b"security.selinux\0user.foobar\0system.test\0",
                b"security.selinux\0user.foobar\0system.test\0",
            ),
            (
                "only strippable names",
                b"user.virtiofs.security.sehash\0user.virtiofs.security.wtf\0",
                b"security.sehash\0security.wtf\0",
            ),
            (
                "mixed names",
                b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wtf\0user.foobar\0",
                b"security.sehash\0security.selinux\0security.wtf\0user.foobar\0",
            ),
            (
                "prefixed name without trailing nul",
                b"user.virtiofs.security.sehash",
                b"security.sehash",
            ),
        ];

        for &(what, input, expected) in cases.iter() {
            assert_eq!(&stripped(input)[..], expected, "case: {}", what);
        }
    }
}
2287